State Farm Distracted Drivers

Prev Exercises: Udacity:DeepLearning:TensorFlow:notMNIST

Baseline

notMNIST: This notebook uses the notMNIST dataset to be used with python experiments. This dataset is designed to look like the classic MNIST dataset, while looking a little more like real data: it's a harder task, and the data is a lot less 'clean' than MNIST.

In [2]:
# Environment setup: report the interpreter version, size the joblib worker
# pool, and timestamp the run (Python 2 notebook -- print statements).
import sys
print sys.version

from joblib import Parallel, delayed  
import multiprocessing
# Leave two cores free so other applications stay responsive while the
# parallel image loads run.
nCores = multiprocessing.cpu_count() - 2 # Allow other apps to run
print 'nCores: %d' % (nCores)

from datetime import datetime, time
print 'now: %s' % str(datetime.now())
2.7.11 (default, Jan 28 2016, 14:07:46) 
[GCC 4.2.1 Compatible Apple LLVM 7.0.2 (clang-700.1.81)]
nCores: 14
now: 2016-05-13 05:58:08.947371
In [3]:
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import display, Image
import rpy2.robjects as robjects
from rpy2.robjects.packages import importr
from rpy2.robjects.lib import grid
from rpy2.robjects.lib import ggplot2
import rpy2.robjects.pandas2ri

import numpy as np
import os
import pandas as pd
from scipy import ndimage
from six.moves.urllib.request import urlretrieve
from six.moves import cPickle as pickle
from skimage import color as sk_color
from skimage import io as sk_io
from skimage import transform as sk_transform
import tarfile

%run img_utils.py
/usr/local/lib/python2.7/site-packages/rpy2/robjects/lib/ggplot2.py:59: UserWarning: This was designed againt ggplot2 version 2.0.0 but you have 2.1.0
  warnings.warn('This was designed againt ggplot2 version %s but you have %s' % (TARGET_VERSION, ggplot2.__version__))

Analytics Specs

This Project

The specs should be in an img_glbSpec_SFDD_* module (e.g. img_glbSpec_SFDD_ImgSz_64.py)

In [4]:
%run img_glbSpec_SFDD_ImgSz_64.py
imported img_glbSpec_SFDD_Img_Sz_64.py
In [5]:
#print 'glbDataFile: %s' % (glbDataFile)

# Echo the globals defined by the img_glbSpec_SFDD_ImgSz_64.py spec so the
# run's configuration is captured in the notebook output.
print 'glbImg: %s' % (glbImg)  # image preprocessing spec (size, crop, color, ...)

print 'glbRspClass: %s' % (glbRspClass)    # response class labels c0..c9
print 'glbRspClassN: %d' % (glbRspClassN)  # number of response classes

print 'glbPickleFile: %s' % (glbPickleFile)  # cache paths for data / models
glbImg: {'color': False, 'crop': {'x': (80, 560)}, 'shape': (480, 640, 3), 'pxlDepth': 255.0, 'center_scale': True, 'size': 64}
glbRspClass: ['c0', 'c1', 'c2', 'c3', 'c4', 'c5', 'c6', 'c7', 'c8', 'c9']
glbRspClassN: 10
glbPickleFile: {'models': 'data/img_M_SFDD_ImgSz_64.pickle', 'data': 'data/img_D_SFDD_ImgSz_64.pickle'}

notMNIST

In [6]:
# glbDataURL = 'http://yaroslavvb.com/upload/notMNIST/'
# glbImg['size'] = 32

Import Data

First, we'll download the dataset to our local machine.

In [8]:
def maybe_download(url, filename, expected_bytes = None):
  """Download a file into data/ if not present, and verify its size.

  Parameters:
      url: base URL; the file is fetched from url + filename.
      filename: bare file name (stored under the local data/ folder).
      expected_bytes: exact expected size; when None, any non-empty
          file passes verification.
  Returns:
      The local path 'data/' + filename.
  Raises:
      Exception: when the downloaded/existing file fails verification.
  """
  destPth = 'data/' + filename
  if not os.path.exists(destPth):
    # Bug fix: download to the same data/ path that is stat'ed below.
    # The original saved to the bare filename, so the os.stat() on
    # 'data/' + filename always failed after a fresh download.
    filename, _ = urlretrieve(url + filename, destPth)
  statinfo = os.stat(destPth)
  if expected_bytes is None:
    # No expected size supplied -- accept any non-empty file
    verified = statinfo.st_size > 0
  else:
    verified = statinfo.st_size == expected_bytes

  if verified:
    print('Found and verified', destPth)
  else:
    raise Exception(
      'Failed to verify ' + destPth + '. Can you get to it with a browser?')
  return destPth

dataFNm = maybe_download(glbDataFile['url'], glbDataFile['filename'])
('Found and verified', 'data/imgs.zip')
In [9]:
# url = 'http://yaroslavvb.com/upload/notMNIST/'

# def maybe_download(url, filename, expected_bytes):
#   """Download a file if not present, and make sure it's the right size."""
#   if not os.path.exists(filename):
#     filename, _ = urlretrieve(url + filename, filename)
#   statinfo = os.stat(filename)
#   if statinfo.st_size == expected_bytes:
#     print('Found and verified', filename)
#   else:
#     raise Exception(
#       'Failed to verify' + filename + '. Can you get to it with a browser?')
#   return filename

# train_filename = maybe_download('data/notMNIST_large.tar.gz', 247336696)
# test_filename = maybe_download('data/notMNIST_small.tar.gz', 8458043)

Extract the dataset from the compressed downloaded file(s).

In [10]:
def extract(filename, num_classes):
  """Extract a .tar.gz dataset archive and return its per-class folders.

  NOTE(review): the bare `return` below makes everything after it
  unreachable and returns None instead of the folder list -- the function
  is currently stubbed out (extraction was presumably done manually).
  TODO: implement the "does data need extracting?" check and remove the
  early return.
  """
  print("Figure out automatically if data needs to be extracted")
  return
    
  tar = tarfile.open(filename)
  root = os.path.splitext(os.path.splitext(filename)[0])[0]  # remove .tar.gz
  print('Extracting data for %s. This may take a while. Please wait.' % root)
  sys.stdout.flush()
  tar.extractall()
  tar.close()
  # My edits: data_folders needs to be modified for the correct path
  data_folders = [
    os.path.join(root, d) for d in sorted(os.listdir(root)) if d != '.DS_Store']
  if len(data_folders) != num_classes:
    raise Exception(
      'Expected %d folders, one per class. Found %d instead.' % (
        num_classes, len(data_folders)))
  print(data_folders)
  return data_folders

# NOTE(review): train_filename / test_filename are only defined in the
# commented-out notMNIST cell above, so this branch would raise NameError
# if glbDataFile['extract'] were True -- confirm 'extract' is always False
# for the SFDD data.
if (glbDataFile['extract']):
    train_folders = extract(os.getcwd() + train_filename, glbRspClassN)
    test_folders  = extract(os.getcwd() + test_filename , glbRspClassN)
In [5]:
driverDf = pd.read_csv('data/driver_imgs_list.csv')
print driverDf.describe()
# print driverDf.shape
print driverDf.head()
print driverDf.tail()
print '\n subject knts:'
print driverDf['subject'].value_counts().sort_values()
       subject classname            img
count    22424     22424          22424
unique      26        10          22424
top       p021        c0  img_97080.jpg
freq      1237      2489              1
  subject classname            img
0    p002        c0  img_44733.jpg
1    p002        c0  img_72999.jpg
2    p002        c0  img_25094.jpg
3    p002        c0  img_69092.jpg
4    p002        c0  img_92629.jpg
      subject classname            img
22419    p081        c9  img_56936.jpg
22420    p081        c9  img_46218.jpg
22421    p081        c9  img_25946.jpg
22422    p081        c9  img_67850.jpg
22423    p081        c9   img_9684.jpg

 subject knts:
p072     346
p042     591
p041     605
p039     651
p045     724
p002     725
p052     740
p050     790
p056     794
p061     809
p075     814
p064     820
p012     823
p081     823
p047     835
p035     848
p015     875
p014     876
p051     920
p049    1011
p066    1034
p016    1078
p026    1196
p024    1226
p022    1233
p021    1237
Name: subject, dtype: int64

notMNIST:
Extraction gives you a set of directories, labelled A through J. The data consists of characters rendered in a variety of fonts on a 28x28 image. The labels are limited to 'A' through 'J' (10 classes). The training set has about 500k labelled examples and the obsNewSet about 19,000. Given these sizes, it should be possible to train models quickly on any machine.

Sample images

Let's take a peek at some of the data to make sure it looks sensible.

In [6]:
# Absolute roots of the train (per-class subfolders) and test image trees
dataRootPth = os.getcwd() + '/data/'
trnFoldersPth = dataRootPth + glbDataFile['trnFoldersPth']
newFoldersPth = dataRootPth + glbDataFile['newFoldersPth']

Display sample train images

Collect data corrections into glbDataScrub

In [7]:
def myreadImage(filePthNm):
    img = sk_io.imread(filePthNm)
    try:
        assert img.shape == glbImg['shape'], 'img.shape: %s' % \
            (img.shape)
        assert np.min(img) >= 0, 'img.min: %.4f' % \
            (np.min(img))
        assert np.max(img) <= glbImg['pxlDepth'], 'img.min: %.4f' % \
            (np.max(img))
    except AssertionError, e:
        print 'filePthNm: %s' % (filePthNm)
        print e
        raise
        
    return(img)

# plt.imshow(myreadImage(trnFoldersPth + '/c0/img_15117.jpg'))
# plt.imshow(myreadImage(trnFoldersPth + '/c8/img_67168.jpg'))
# plt.imshow(myreadImage(trnFoldersPth + '/c9/img_84986.jpg'))
plt.imshow(myreadImage(trnFoldersPth + '/c9/img_89196.jpg'))
# plt.imshow(myreadImage(trnFoldersPth + '/c9/img_95888.jpg'))
Out[7]:
<matplotlib.image.AxesImage at 0x116976c90>
In [17]:
# Sample smpN random training images for every class and display them in a
# len(glbRspClass) x smpN grid (rows = classes, cols = samples).
# NOTE(review): np.random is unseeded, so the sample differs per run;
# duplicate random indices collapse into one dict key, which can leave a
# row with fewer than smpN populated columns.
smpClsImg = {}; smpN = 3
for cls in glbRspClass:
    clsImg = {}
#     print 'Class: %s' % (cls)
    clsPth = trnFoldersPth + '/' + cls
    onlyfiles = [f for f in os.listdir(clsPth) 
                    if os.path.isfile(os.path.join(clsPth, f))]
    for ix in np.random.randint(0, len(onlyfiles), size = smpN):
#         print '  %s:' % (onlyfiles[ix])
#         img = sk_io.imread(clsPth + '/' + onlyfiles[ix])
#         assert img.shape == (480, 640, 3), 'img.shape: %s' % (img.shape)
#         assert np.min(img) == 0, 'img.min: %.4f' % (np.min(img))
#         assert np.max(img) == glbImg['pxlDepth'], 'img.min: %.4f' % (np.max(img))        
        clsImg[onlyfiles[ix]] = myreadImage(clsPth + '/' + onlyfiles[ix])
#         jpgfile = Image(clsPth + '/' + onlyfiles[ix], format = 'jpg', 
#                         width = glbImg['size'] * 4, height = glbImg['size'] * 4)
#         display(jpgfile)

    smpClsImg[cls] = clsImg
    
# print smpClsImg    
        
# One subplot row per class, one column per sampled image
figs, axes = plt.subplots(len(glbRspClass), smpN, 
                          figsize=(5 * smpN, 4 * len(glbRspClass)))
[(ax.set_xticks([]), ax.set_yticks([]), ax.axis('off')) for ax in axes.flatten()]
for i, cls in enumerate(smpClsImg.keys()):
    for j, imgFileName in enumerate(smpClsImg[cls].keys()):
        axes[i, j].imshow(smpClsImg[cls][imgFileName])
        axes[i, j].set_title(cls + ':' + imgFileName)
In [19]:
# Sample smpN random subjects and, for each, smpN random classes; display
# one image per (subject, class) pair in an smpN x smpN grid.
# NOTE(review): sampling WITH replacement means duplicate subjects/classes
# collapse into one dict key, so some grid cells may stay empty.
smpSbtImg = {}; smpN = 3
for sbt in driverDf['subject'].values[
        np.random.randint(0, len(driverDf['subject'].values), 
                          size = smpN)]:
    sbtImg = {}
    driverSbtDf = driverDf[driverDf['subject'] == sbt]

    # Bug fix: removed a leftover clsPth/onlyfiles computation here that
    # read `cls` from the PREVIOUS cell's loop (hidden kernel state) and
    # whose result was never used in this cell.
    for cls in driverSbtDf['classname'].values[
            np.random.randint(0, len(driverSbtDf['classname'].values), 
                              size = smpN)]:
        # First image listed for this subject/class combination
        imgFnm = driverSbtDf[driverSbtDf['classname'] == cls]['img'].iloc[0]    
        dctKey = cls + ':' + imgFnm
        imgFnm = trnFoldersPth + '/' + cls + '/' + imgFnm                    
        sbtImg[dctKey] = myreadImage(imgFnm)

    smpSbtImg[sbt] = sbtImg

nRow = smpN; nCol = smpN    
figs, axes = plt.subplots(nRow, nCol, 
                          figsize=(6 * nCol, 6 * nRow))
[(ax.set_xticks([]), ax.set_yticks([]), ax.axis('off')) for ax in axes.flatten()]
for i, sbt in enumerate(smpSbtImg.keys()):
    for j, imgDesc in enumerate(smpSbtImg[sbt].keys()):
        axes[i, j].imshow(smpSbtImg[sbt][imgDesc])
        axes[i, j].set_title(sbt + ':' + imgDesc)
In [21]:
def mytransformImage(raw, retVals = 'final'):
    """Apply the glbImg preprocessing pipeline to one raw image.

    Stages, in order: crop (resized back to raw shape) -> resize to the
    target square size -> grayscale -> center/scale.

    Parameters:
        raw: raw image array as read by myreadImage.
        retVals: 'final' -> return only the fully processed image;
                 'each'  -> return a dict with each stage applied
                 independently to the raw image ('crp', 'sze', 'gry',
                 'c_s') plus 'raw' and the cumulative 'fnl'.
    """
    assert retVals in ['final', 'each'], \
        'unsupported retVals option: %s' % (retVals)
    
    # 'fnl' accumulates all transformations; the per-stage keys (only for
    # retVals == 'each') each show a single transformation on the raw image.
    prcImgDct = {'raw': raw, 'fnl': raw.astype(float)}
    fnlShape = rawShape = raw.shape
    
    # 'crop': crop to the configured x/y ranges, then resize back to the
    # raw shape so later stages always see a constant input shape
    if ('crop' in glbImg.keys()):
        xmin = 0; xmax = rawShape[1]
        ymin = 0; ymax = rawShape[0]        
        if ('x' in glbImg['crop'].keys()):
            xmin, xmax = glbImg['crop']['x']
        if ('y' in glbImg['crop'].keys()):
            ymin, ymax = glbImg['crop']['y']
        
        if retVals == 'each':
            prcImgDct['crp'] = sk_transform.resize(raw[ymin : ymax, 
                                                       xmin : xmax], 
                                                   rawShape)
        prcImgDct['fnl'] = sk_transform.resize(
                            prcImgDct['fnl'][ymin : ymax, xmin : xmax], 
                                               rawShape)
    # 'size': downsample to a glbImg['size'] square, channels unchanged        
#     if not glbImg['color']:        
#         fnlShape = (glbImg['size'], glbImg['size'], 1)
#     else:    
#         fnlShape = (glbImg['size'], glbImg['size'], rawShape[2])
    fnlShape = (glbImg['size'], glbImg['size'], rawShape[2])        
    if (rawShape != fnlShape):
        if retVals == 'each':
            prcImgDct['sze'] = sk_transform.resize(raw, fnlShape)
        prcImgDct['fnl'] = sk_transform.resize(prcImgDct['fnl'], fnlShape)
           
    # 'color': collapse RGB to a single grayscale channel        
    if not glbImg['color']:
        if retVals == 'each':        
            prcImgDct['gry'] = sk_color.rgb2gray(raw)
        prcImgDct['fnl'] = sk_color.rgb2gray(prcImgDct['fnl'])
        
    # 'center_scale': shift by pxlDepth/2 and divide by pxlDepth.
    # NOTE(review): this assumes 'fnl' still holds the 0..pxlDepth range
    # after resize/rgb2gray (true only if skimage leaves float inputs
    # unrescaled) -- confirm against the skimage version in use.
    if glbImg['center_scale']:
        if retVals == 'each':        
            prcImgDct['c_s'] = (raw.astype(float) - glbImg['pxlDepth'] / 2.0) / \
                                glbImg['pxlDepth']
        prcImgDct['fnl'] = (prcImgDct['fnl'] - glbImg['pxlDepth'] / 2.0) / \
                                glbImg['pxlDepth']
        
    if retVals == 'final':
        return prcImgDct['fnl']
    else:
        return prcImgDct
        
# Visual check of mytransformImage: raw vs. final image, then each
# intermediate stage, for one sampled image.
sbt = smpSbtImg.keys()[0]
tstRawImg = smpSbtImg[sbt][smpSbtImg[sbt].keys()[0]]
tstPrcImg = mytransformImage(tstRawImg, retVals = 'final')
nRow = 1; nCol = 2
figs, axes = plt.subplots(nRow, nCol, 
                          figsize=(6 * nCol, 4 * nRow))
[(ax.set_xticks([]), ax.set_yticks([]), ax.axis('off')) for ax in axes.flatten()]
for j, typImg in enumerate(range(2)):
    if (j == 0):
        axes[j].imshow(tstRawImg)
        axes[j].set_title('raw')
    if (j == 1):
        # NOTE(review): plt.imshow draws on the CURRENT axes; this works
        # only because the last-created axes here is axes[1].  Prefer
        # axes[j].imshow for robustness.
        if not glbImg['color']:
            plt.imshow(tstPrcImg, cmap = 'gray')
        else:    
            plt.imshow(tstPrcImg)        
        axes[j].set_title('fnl')            
plt.show()        
    
# Second pass: request every intermediate stage
tstPrcImg = mytransformImage(tstRawImg, retVals = 'each')
nRow = 1; nCol = 2
figs, axes = plt.subplots(nRow, nCol, 
                          figsize=(6 * nCol, 4 * nRow))
[(ax.set_xticks([]), ax.set_yticks([]), ax.axis('off')) for ax in axes.flatten()]
for j, typImg in enumerate(range(2)):
    if (j == 0):
        axes[j].imshow(tstRawImg)
        axes[j].set_title('raw')        
    if (j == 1):
        # NOTE(review): same plt.imshow-vs-axes[j] caveat as above
        if not glbImg['color']:
            plt.imshow(tstPrcImg['fnl'], cmap = 'gray')
        else:    
            plt.imshow(tstPrcImg['fnl'])        
        axes[j].set_title('fnl')            
# Plot the remaining stages (set difference order is arbitrary, so the
# column order may vary between runs)
nRow = 1; nCol = len(tstPrcImg.values()) - 2
figs, axes = plt.subplots(nRow, nCol, 
                          figsize=(6 * nCol, 4 * nRow))
[(ax.set_xticks([]), ax.set_yticks([]), ax.axis('off')) for ax in axes.flatten()]
for j, typImg in enumerate(list(set(tstPrcImg.keys()) - set(['raw', 'fnl']))):
    if (typImg == 'gry'):
        axes[j].imshow(tstPrcImg[typImg], cmap = 'gray')
    else:    
        axes[j].imshow(tstPrcImg[typImg])
    axes[j].set_title(typImg)
In [23]:
# Run mytransformImage over every sampled image of the first subject and
# plot the stages of each image as one grid row per image.
smpSbt0Img = smpSbtImg[smpSbtImg.keys()[0]]
smpPrcImg = {}
for key, value in smpSbt0Img.items():
    # Prefix keys with the subject id: 'subject:class:filename'
    smpPrcImg[smpSbtImg.keys()[0] + ':' + key] = value
    
print 'smpPrcImg.keys(): %s' % (smpPrcImg.keys())
for key, raw in smpPrcImg.items():
    # Replace each raw image with its dict of processing stages
    prcImgDct = mytransformImage(raw, retVals = 'each')        
    smpPrcImg[key] = prcImgDct

# Ideally 'fnl' should be the last col in the plot    
nRow = len(smpPrcImg.keys()); nCol = len(smpPrcImg.values()[0].keys())
# print 'nRow: %d; nCol: %d' % (nRow, nCol)
figs, axes = plt.subplots(nRow, nCol, 
                          figsize=(6 * nCol, 4 * nRow))
[(ax.set_xticks([]), ax.set_yticks([]), ax.axis('off')) for ax in axes.flatten()]
for i, sbtClsImgFnm in enumerate(smpPrcImg.keys()):
    for j, typImg in enumerate(smpPrcImg[sbtClsImgFnm].keys()):
        # Use the grayscale colormap for the 'gry' stage, and for 'fnl'
        # whenever a grayscale stage exists (final image is single-channel)
        if ((typImg == 'gry') or 
            ((typImg == 'fnl') and ('gry' in smpPrcImg[sbtClsImgFnm].keys()))):
            if (nRow > 1):
                axes[i, j].imshow(smpPrcImg[sbtClsImgFnm][typImg], cmap = 'gray')
            else:
                axes[j].imshow(smpPrcImg[sbtClsImgFnm][typImg], cmap = 'gray')
        else:    
            if (nRow > 1):            
                axes[i, j].imshow(smpPrcImg[sbtClsImgFnm][typImg])
            else:    
                axes[j].imshow(smpPrcImg[sbtClsImgFnm][typImg])
        if (nRow > 1):        
            axes[i, j].set_title(sbtClsImgFnm + ':' + typImg)
        else:    
            axes[j].set_title(sbtClsImgFnm + ':' + typImg)        
smpPrcImg.keys(): ['p016:c9:img_57609.jpg', 'p016:c8:img_100735.jpg']

Display sample test images

In [25]:
# Sample smpN**2 random test images and display them in an smpN x smpN grid
onlyfiles = [f for f in os.listdir(newFoldersPth)
                    if os.path.isfile(os.path.join(newFoldersPth, f))]

smpNewImg = {}; smpN = 3
for ix in np.random.randint(0, len(onlyfiles), size = smpN ** 2):
    imgFnm = onlyfiles[ix]
    smpNewImg[imgFnm] = myreadImage(newFoldersPth + '/' + imgFnm)

nRow = smpN; nCol = smpN
figs, axes = plt.subplots(nRow, nCol,
                          figsize=(6 * nCol, 5 * nRow))
[(ax.set_xticks([]), ax.set_yticks([]), ax.axis('off')) for ax in axes.flatten()]
for i, imgFnm in enumerate(smpNewImg.keys()):
    # Map the flat index onto the grid (row-major)
    gridRow, gridCol = divmod(i, nCol)
    axes[gridRow, gridCol].imshow(smpNewImg[imgFnm])
    axes[gridRow, gridCol].set_title(imgFnm)

notMNIST:

Each exemplar should be an image of a character A through J rendered in a different font.

In [26]:
# Display sample train images
# train_folders_path = '/Users/bbalaji-2012/Documents/Work/Courses/Udacity/DeepLearning/code/tensorflow/examples/udacity/data/notMNIST_large/'
# glbImg['size'] = 28
# display(Image(train_folders_path + 'A/a2F6b28udHRm.png', \
#               width = glbImg['size'] * 4, height = glbImg['size'] * 4))
# display(Image(train_folders_path + 'B/bnVuaS50dGY=.png', \
#               width = glbImg['size'] * 4, height = glbImg['size'] * 4))
# display(Image(train_folders_path + 'C/cmlzay50dGY=.png', \
#               width = glbImg['size'] * 4, height = glbImg['size'] * 4))

Populate database

Now let's load the data in a more manageable format.

We'll convert the entire dataset into a 3D array (image index, x, y) of floating point values, normalized to have approximately zero mean (notMNIST only: and standard deviation ~0.5) to make training easier down the road. The labels will be stored into a separate array (notMNIST only: of integers 0 through 9.)

A few images might not be readable, we'll just skip them.

In [27]:
trnFolders = os.getcwd() + '/data/' + glbDataFile['trnFoldersPth']
trnFolders = [trnFolders + '/' + cls for cls in glbRspClass]
print 'trnFolders: %s' % (trnFolders)
newFolders = [os.getcwd() + '/data/' + glbDataFile['newFoldersPth']]
print 'newFolders: %s' % (newFolders)
trnFolders: ['/Users/bbalaji-2012/Documents/Work/DataScience/Kaggle/StateFarm/data/imgs/train/c0', '/Users/bbalaji-2012/Documents/Work/DataScience/Kaggle/StateFarm/data/imgs/train/c1', '/Users/bbalaji-2012/Documents/Work/DataScience/Kaggle/StateFarm/data/imgs/train/c2', '/Users/bbalaji-2012/Documents/Work/DataScience/Kaggle/StateFarm/data/imgs/train/c3', '/Users/bbalaji-2012/Documents/Work/DataScience/Kaggle/StateFarm/data/imgs/train/c4', '/Users/bbalaji-2012/Documents/Work/DataScience/Kaggle/StateFarm/data/imgs/train/c5', '/Users/bbalaji-2012/Documents/Work/DataScience/Kaggle/StateFarm/data/imgs/train/c6', '/Users/bbalaji-2012/Documents/Work/DataScience/Kaggle/StateFarm/data/imgs/train/c7', '/Users/bbalaji-2012/Documents/Work/DataScience/Kaggle/StateFarm/data/imgs/train/c8', '/Users/bbalaji-2012/Documents/Work/DataScience/Kaggle/StateFarm/data/imgs/train/c9']
newFolders: ['/Users/bbalaji-2012/Documents/Work/DataScience/Kaggle/StateFarm/data/imgs/test']
In [28]:
# data_folders_path = '/Users/bbalaji-2012/Documents/Work/Courses/Udacity/DeepLearning/code/tensorflow/examples/udacity/data/'
# train_folders = [data_folders_path + 'notMNIST_large/' + d \
#                  for d in sorted(os.listdir(data_folders_path + 'notMNIST_large/')) \
#                     if d != '.DS_Store']
# print train_folders
# test_folders  = [data_folders_path + 'notMNIST_small/' + d \
#                  for d in sorted(os.listdir(data_folders_path + 'notMNIST_small/')) \
#                     if d != '.DS_Store']
# print test_folders
In [29]:
#from scipy import misc as sp_misc
In [30]:
def load(idClass, folderPth, nImgMax, maxCheck = True, verbose = False):
  
    assert isinstance(idClass, str), \
        'expecting type(idClass) as str, not %s' % (type(idClass))  

    assert isinstance(folderPth, str), \
        'expecting type(folderPth) as str, not %s' % (type(folderPth))  
    
    assert nImgMax > 0, \
        'nImgMax: %d has to be > 0' % (nImgMax)  
    
    assert isinstance(maxCheck, bool), \
        'expecting type(maxCheck) as bool, not %s' % (type(maxCheck))  
    
    startTm = datetime.now()  
    
    ids = ['' for ix in xrange(nImgMax)]  
    dataset = np.ndarray(
        shape=(nImgMax, glbImg['size'], glbImg['size']), dtype=np.float32)
    labels = np.ndarray(shape=(nImgMax), dtype=np.int32)
#   label_index = 0
    try:
        labelsVal = glbRspClass.index(idClass)
    except ValueError, e:
        print 'unknown class: %s; defaulting label to -1' % (idClass)
        labelsVal = -1
    except Exception, e:
        print(e)
        raise
        
    labels[:] = labelsVal  
    image_index = 0

#   if isinstance(data_folders, str):
#     data_folders = [data_folders]

#   for fldrIx, folder in enumerate(data_folders):
    print 'Class: %s; Folder: %s' % (idClass, folderPth)
#     print(os.listdir(folder)[:6])
    for image in os.listdir(folderPth):
#       print(image)
#       print((image_index >= (nImgMax / len(data_folders) * (fldrIx + 1))))
      if maxCheck and (image_index >= nImgMax):
        raise Exception('More images than expected: %d >= %d' % (
          image_index, nImgMax))
#       elif (image_index >= (nImgMax / len(data_folders) * (fldrIx + 1))):
      elif image_index >= nImgMax: break
        
      image_file = os.path.join(folderPth, image)
      try:
        rawImg = myreadImage(image_file)
      except IOError as e:
        print('Could not read:', image_file, ':', e, '- it\'s ok, skipping.')  
        next
        
      prcImg = mytransformImage(rawImg, retVals = 'final')  
#       try:
#         rsz_image_data = sp_misc.imresize(ndimage.imread(image_file, flatten = not glbImgColor), 
#                                       (glbImg['size'], glbImg['size']))
#         image_data = (rsz_image_data.astype(float) -
#                       glbImgPixelDepth / 2) / glbImgPixelDepth
#         if image_data.shape != (glbImg['size'], glbImg['size']):
#           raise Exception('Unexpected image shape: %s' % str(image_data.shape))
        
      ids[image_index] = image
      dataset[image_index, :, :] = prcImg
#       labels[image_index] = label_index
        
      if mydspVerboseTrigger(image_index): 
#             print '  image_index: %d; %s:' % (image_index, image)
            print '  image_index: %5d (%5d secs)' % \
                (image_index, (datetime.now() - startTm).seconds)
            if verbose:
                nRow = 1; nCol = 2
                figs, axes = plt.subplots(nRow, nCol, 
                                              figsize=(6 * nCol, 4 * nRow))
                [(ax.set_xticks([]), ax.set_yticks([]), ax.axis('off')) 
                     for ax in axes.flatten()]
                for j, typImg in enumerate(range(0, nCol)):
                    if (j == 0):
                        axes[j].imshow(rawImg)
    #                     axes[j].set_title(glbRspClass[label_index] + ':' + image + ':raw')                    
                        axes[j].set_title(idClass + ':' + image + ':raw')                                        
                    else:    
                        if not glbImg['color']:
                            axes[j].imshow(prcImg, cmap = 'gray')
                        else:    
                            axes[j].imshow(prcImg)
                        axes[j].set_title('fnl')
    #             display(sp_misc.toimage(rsz_image_data))
                plt.show()
            
      image_index += 1            
#     label_index += 1
    
    num_images = image_index
    ids = ids[0:num_images]  
    dataset = dataset[0:num_images, :, :]
    labels = labels[0:num_images]
#   if num_images < min_num_images:
#     raise Exception('Many fewer images than expected: %d < %d' % (
#         num_images, min_num_images))
    print('  Identifiers:', len(ids))
    print('  Full dataset tensor:', dataset.shape)
    print('  Mean:', np.mean(dataset))
    print('  Standard deviation:', np.std(dataset))
    print('  Labels:', labels.shape)
    print('  Label Knts:'); print(pd.Series(labels).value_counts())    
    
    return {'Cls': idClass, 'Dbs': {'Idn': ids, 'Ftr': dataset, 'Rsp': labels}}

# Smoke-test load() on one labeled class and on the unlabeled test folder
smpC5ObsTrnDct = load('c5', trnFolders[5], 25, maxCheck = False, verbose = True)
smpObsNewDct = load('new', newFolders[0], 25, maxCheck = False, verbose = False)
# smqObsTrnIdn, smqObsTrnFtr, smqObsTrnRsp = load(trnFolders, 250, 
#                                                 max_check = False)
# print smpObsTrnRsp.value_counts()
# print smpObsTrnIdn[10:15]
# glbObsTrnIdn, glbObsTrnFtr, glbObsTrnRsp = load(trnFolders, 22435)
Class: c5; Folder: /Users/bbalaji-2012/Documents/Work/DataScience/Kaggle/StateFarm/data/imgs/train/c5
  image_index:     0 (    0 secs)
  image_index:     2 (    0 secs)
  image_index:     4 (    1 secs)
  image_index:     6 (    1 secs)
  image_index:     8 (    2 secs)
  image_index:    20 (    3 secs)
('  Identifiers:', 25)
('  Full dataset tensor:', (25, 64, 64))
('  Mean:', -0.11314436)
('  Standard deviation:', 0.32475406)
('  Labels:', (25,))
  Label Knts:
5    25
dtype: int64
unknown class: new; defaulting label to -1
Class: new; Folder: /Users/bbalaji-2012/Documents/Work/DataScience/Kaggle/StateFarm/data/imgs/test
  image_index:     0 (    0 secs)
  image_index:     2 (    0 secs)
  image_index:     4 (    0 secs)
  image_index:     6 (    0 secs)
  image_index:     8 (    0 secs)
  image_index:    20 (    1 secs)
('  Identifiers:', 25)
('  Full dataset tensor:', (25, 64, 64))
('  Mean:', -0.10439148)
('  Standard deviation:', 0.33399168)
('  Labels:', (25,))
  Label Knts:
-1    25
dtype: int64

Compare sequential vs. parallel loading results

In [31]:
thsBgnTm = datetime.now()
smqObsTrnLst = []
# for cls in glbRspClass[-2:]:
for cls in glbRspClass:    
    smqClsObsTrnDct = load(cls, trnFolders[glbRspClass.index(cls)], 25, 
                           maxCheck = False, verbose = False)
    smqObsTrnLst.append(smqClsObsTrnDct)

print 'len(smqObsTrnLst): %d' % (len(smqObsTrnLst))    
thsDurDff = (datetime.now() - thsBgnTm).seconds  
print 'Trn Smp Sequential load duration: %0.2f seconds' % (thsDurDff) 
Class: c0; Folder: /Users/bbalaji-2012/Documents/Work/DataScience/Kaggle/StateFarm/data/imgs/train/c0
  image_index:     0 (    0 secs)
  image_index:     2 (    0 secs)
  image_index:     4 (    0 secs)
  image_index:     6 (    0 secs)
  image_index:     8 (    0 secs)
  image_index:    20 (    1 secs)
('  Identifiers:', 25)
('  Full dataset tensor:', (25, 64, 64))
('  Mean:', -0.10371291)
('  Standard deviation:', 0.3219693)
('  Labels:', (25,))
  Label Knts:
0    25
dtype: int64
Class: c1; Folder: /Users/bbalaji-2012/Documents/Work/DataScience/Kaggle/StateFarm/data/imgs/train/c1
  image_index:     0 (    0 secs)
  image_index:     2 (    0 secs)
  image_index:     4 (    0 secs)
  image_index:     6 (    0 secs)
  image_index:     8 (    0 secs)
  image_index:    20 (    1 secs)
('  Identifiers:', 25)
('  Full dataset tensor:', (25, 64, 64))
('  Mean:', -0.11209381)
('  Standard deviation:', 0.32118186)
('  Labels:', (25,))
  Label Knts:
1    25
dtype: int64
Class: c2; Folder: /Users/bbalaji-2012/Documents/Work/DataScience/Kaggle/StateFarm/data/imgs/train/c2
  image_index:     0 (    0 secs)
  image_index:     2 (    0 secs)
  image_index:     4 (    0 secs)
  image_index:     6 (    0 secs)
  image_index:     8 (    0 secs)
  image_index:    20 (    1 secs)
('  Identifiers:', 25)
('  Full dataset tensor:', (25, 64, 64))
('  Mean:', -0.10655484)
('  Standard deviation:', 0.33100829)
('  Labels:', (25,))
  Label Knts:
2    25
dtype: int64
Class: c3; Folder: /Users/bbalaji-2012/Documents/Work/DataScience/Kaggle/StateFarm/data/imgs/train/c3
  image_index:     0 (    0 secs)
  image_index:     2 (    0 secs)
  image_index:     4 (    0 secs)
  image_index:     6 (    0 secs)
  image_index:     8 (    0 secs)
  image_index:    20 (    1 secs)
('  Identifiers:', 25)
('  Full dataset tensor:', (25, 64, 64))
('  Mean:', -0.099427894)
('  Standard deviation:', 0.32730374)
('  Labels:', (25,))
  Label Knts:
3    25
dtype: int64
Class: c4; Folder: /Users/bbalaji-2012/Documents/Work/DataScience/Kaggle/StateFarm/data/imgs/train/c4
  image_index:     0 (    0 secs)
  image_index:     2 (    0 secs)
  image_index:     4 (    0 secs)
  image_index:     6 (    0 secs)
  image_index:     8 (    0 secs)
  image_index:    20 (    1 secs)
('  Identifiers:', 25)
('  Full dataset tensor:', (25, 64, 64))
('  Mean:', -0.10423807)
('  Standard deviation:', 0.31945148)
('  Labels:', (25,))
  Label Knts:
4    25
dtype: int64
Class: c5; Folder: /Users/bbalaji-2012/Documents/Work/DataScience/Kaggle/StateFarm/data/imgs/train/c5
  image_index:     0 (    0 secs)
  image_index:     2 (    0 secs)
  image_index:     4 (    0 secs)
  image_index:     6 (    0 secs)
  image_index:     8 (    0 secs)
  image_index:    20 (    1 secs)
('  Identifiers:', 25)
('  Full dataset tensor:', (25, 64, 64))
('  Mean:', -0.11314436)
('  Standard deviation:', 0.32475406)
('  Labels:', (25,))
  Label Knts:
5    25
dtype: int64
Class: c6; Folder: /Users/bbalaji-2012/Documents/Work/DataScience/Kaggle/StateFarm/data/imgs/train/c6
  image_index:     0 (    0 secs)
  image_index:     2 (    0 secs)
  image_index:     4 (    0 secs)
  image_index:     6 (    0 secs)
  image_index:     8 (    0 secs)
  image_index:    20 (    1 secs)
('  Identifiers:', 25)
('  Full dataset tensor:', (25, 64, 64))
('  Mean:', -0.099318922)
('  Standard deviation:', 0.33076665)
('  Labels:', (25,))
  Label Knts:
6    25
dtype: int64
Class: c7; Folder: /Users/bbalaji-2012/Documents/Work/DataScience/Kaggle/StateFarm/data/imgs/train/c7
  image_index:     0 (    0 secs)
  image_index:     2 (    0 secs)
  image_index:     4 (    0 secs)
  image_index:     6 (    0 secs)
  image_index:     8 (    0 secs)
  image_index:    20 (    1 secs)
('  Identifiers:', 25)
('  Full dataset tensor:', (25, 64, 64))
('  Mean:', -0.087649062)
('  Standard deviation:', 0.32005942)
('  Labels:', (25,))
  Label Knts:
7    25
dtype: int64
Class: c8; Folder: /Users/bbalaji-2012/Documents/Work/DataScience/Kaggle/StateFarm/data/imgs/train/c8
  image_index:     0 (    0 secs)
  image_index:     2 (    0 secs)
  image_index:     4 (    0 secs)
  image_index:     6 (    0 secs)
  image_index:     8 (    0 secs)
  image_index:    20 (    1 secs)
('  Identifiers:', 25)
('  Full dataset tensor:', (25, 64, 64))
('  Mean:', -0.10032745)
('  Standard deviation:', 0.32751)
('  Labels:', (25,))
  Label Knts:
8    25
dtype: int64
Class: c9; Folder: /Users/bbalaji-2012/Documents/Work/DataScience/Kaggle/StateFarm/data/imgs/train/c9
  image_index:     0 (    0 secs)
  image_index:     2 (    0 secs)
  image_index:     4 (    0 secs)
  image_index:     6 (    0 secs)
  image_index:     8 (    0 secs)
  image_index:    20 (    1 secs)
('  Identifiers:', 25)
('  Full dataset tensor:', (25, 64, 64))
('  Mean:', -0.086796135)
('  Standard deviation:', 0.33149031)
('  Labels:', (25,))
  Label Knts:
9    25
dtype: int64
len(smqObsTrnLst): 10
Sequential load duration: 21.00 seconds
In [32]:
thsBgnTm = datetime.now()
smrObsTrnLst = Parallel(n_jobs = nCores, verbose = 1)(delayed(
        load)(cls, trnFolders[glbRspClass.index(cls)], 25, 
                maxCheck = False, verbose = False) for cls in glbRspClass)
print 'len(smrObsTrnLst): %d' % (len(smrObsTrnLst))    
thsDurDff = (datetime.now() - thsBgnTm).seconds  
print 'Trn Smp Parallel load duration: %0.2f seconds' % (thsDurDff) 
len(smrObsTrnLst): 10
Parallel load duration: 3.00 seconds
Class: c0; Folder: /Users/bbalaji-2012/Documents/Work/DataScience/Kaggle/StateFarm/data/imgs/train/c0
Class: c1; Folder: /Users/bbalaji-2012/Documents/Work/DataScience/Kaggle/StateFarm/data/imgs/train/c1
Class: c2; Folder: /Users/bbalaji-2012/Documents/Work/DataScience/Kaggle/StateFarm/data/imgs/train/c2
Class: c3; Folder: /Users/bbalaji-2012/Documents/Work/DataScience/Kaggle/StateFarm/data/imgs/train/c3
Class: c4; Folder: /Users/bbalaji-2012/Documents/Work/DataScience/Kaggle/StateFarm/data/imgs/train/c4
Class: c5; Folder: /Users/bbalaji-2012/Documents/Work/DataScience/Kaggle/StateFarm/data/imgs/train/c5
Class: c6; Folder: /Users/bbalaji-2012/Documents/Work/DataScience/Kaggle/StateFarm/data/imgs/train/c6
Class: c7; Folder: /Users/bbalaji-2012/Documents/Work/DataScience/Kaggle/StateFarm/data/imgs/train/c7
Class: c9; Folder: /Users/bbalaji-2012/Documents/Work/DataScience/Kaggle/StateFarm/data/imgs/train/c9
Class: c8; Folder: /Users/bbalaji-2012/Documents/Work/DataScience/Kaggle/StateFarm/data/imgs/train/c8
  image_index:     0 (    0 secs)  image_index:     0 (    0 secs)  image_index:     0 (    0 secs)  image_index:     0 (    0 secs)  image_index:     0 (    0 secs)  image_index:     0 (    0 secs)  image_index:     0 (    0 secs)  image_index:     0 (    0 secs)  image_index:     0 (    0 secs)  image_index:     0 (    0 secs)









  image_index:     2 (    0 secs)  image_index:     2 (    0 secs)  image_index:     2 (    0 secs)  image_index:     2 (    0 secs)  image_index:     2 (    0 secs)  image_index:     2 (    0 secs)  image_index:     2 (    0 secs)  image_index:     2 (    0 secs)  image_index:     2 (    0 secs)  image_index:     2 (    0 secs)









  image_index:     4 (    0 secs)  image_index:     4 (    0 secs)  image_index:     4 (    0 secs)  image_index:     4 (    0 secs)  image_index:     4 (    0 secs)  image_index:     4 (    0 secs)  image_index:     4 (    0 secs)  image_index:     4 (    0 secs)  image_index:     4 (    0 secs)  image_index:     4 (    0 secs)









  image_index:     6 (    0 secs)  image_index:     6 (    0 secs)  image_index:     6 (    0 secs)  image_index:     6 (    0 secs)  image_index:     6 (    0 secs)  image_index:     6 (    0 secs)  image_index:     6 (    0 secs)  image_index:     6 (    0 secs)  image_index:     6 (    0 secs)  image_index:     6 (    0 secs)









  image_index:     8 (    1 secs)  image_index:     8 (    1 secs)  image_index:     8 (    1 secs)  image_index:     8 (    1 secs)  image_index:     8 (    1 secs)  image_index:     8 (    1 secs)  image_index:     8 (    1 secs)  image_index:     8 (    1 secs)  image_index:     8 (    1 secs)  image_index:     8 (    1 secs)









  image_index:    20 (    2 secs)  image_index:    20 (    2 secs)  image_index:    20 (    2 secs)  image_index:    20 (    2 secs)  image_index:    20 (    2 secs)  image_index:    20 (    2 secs)  image_index:    20 (    2 secs)  image_index:    20 (    2 secs)  image_index:    20 (    2 secs)  image_index:    20 (    2 secs)









('  Identifiers:', 25)('  Identifiers:', 25)('  Identifiers:', 25)('  Identifiers:', 25)('  Identifiers:', 25)('  Identifiers:', 25)('  Identifiers:', 25)('  Identifiers:', 25)('  Identifiers:', 25)('  Identifiers:', 25)









('  Full dataset tensor:', (25, 64, 64))
('  Full dataset tensor:', (25, 64, 64))
('  Full dataset tensor:', (25, 64, 64))
('  Full dataset tensor:', (25, 64, 64))
('  Full dataset tensor:', (25, 64, 64))
('  Full dataset tensor:', (25, 64, 64))
('  Full dataset tensor:', (25, 64, 64))
('  Full dataset tensor:', (25, 64, 64))
('  Full dataset tensor:', (25, 64, 64))
('  Full dataset tensor:', (25, 64, 64))
('  Mean:', -0.10371291)
('  Mean:', -0.11209381)
('  Mean:', -0.10655484)
('  Mean:', -0.099427894)
('  Mean:', -0.10423807)
('  Mean:', -0.11314436)
('  Mean:', -0.099318922)
('  Mean:', -0.087649062)
('  Mean:', -0.086796135)
('  Mean:', -0.10032745)
('  Standard deviation:', 0.3219693)
('  Standard deviation:', 0.32118186)
('  Standard deviation:', 0.33100829)
('  Standard deviation:', 0.32730374)
('  Standard deviation:', 0.31945148)
('  Standard deviation:', 0.32475406)
('  Standard deviation:', 0.33076665)
('  Standard deviation:', 0.32005942)
('  Standard deviation:', 0.33149031)
('  Standard deviation:', 0.32751)
('  Labels:', (25,))
('  Labels:', (25,))
('  Labels:', (25,))
('  Labels:', (25,))
('  Labels:', (25,))
('  Labels:', (25,))
('  Labels:', (25,))
('  Labels:', (25,))
('  Labels:', (25,))
('  Labels:', (25,))
  Label Knts:
  Label Knts:
  Label Knts:
  Label Knts:
  Label Knts:
  Label Knts:
  Label Knts:
  Label Knts:
  Label Knts:
  Label Knts:
0    25
dtype: int641    25
dtype: int642    25
dtype: int643    25
dtype: int644    25
dtype: int645    25
dtype: int646    25
dtype: int647    25
dtype: int649    25
dtype: int648    25
dtype: int64









[Parallel(n_jobs=14)]: Done  10 out of  10 | elapsed:    3.3s finished
In [33]:
def myisEqualDct(d1, d2):
    d1_keys = set(d1.keys())
    d2_keys = set(d2.keys())
    intersect_keys = d1_keys.intersection(d2_keys)
    added = d1_keys - d2_keys
    removed = d2_keys - d1_keys
    
#     modified = {o : (d1[o], d2[o]) for o in intersect_keys if d1[o] != d2[o]}
    modified = {}
    for o in intersect_keys:
        if not (isinstance(d1[o], dict)):
            try:
                eql = d1[o] == d2[o]
    #             eql = (d1[o] == d2[o]) if not (isinstance(d1[o], dict)) else \
    #                   myisEqualDct(d1[o], d2[o])
            except ValueError, e:
                print e
                print 'key: %s: type:' % (o) 
                print type(d1[o]).mro()
                raise
        else: eql = myisEqualDct(d1[o], d2[o])
        if not isinstance(eql, bool):
#             print 'eql:'; print eql    
            eql = eql.all()
        if not eql: modified[o] = eql
        
    same = set(o for o in intersect_keys if not o in modified.keys())
    
    if (len(added) > 0):
        print '     added: %s' % (added)
    if (len(removed) > 0):
        print '   removed: %s' % (removed)
    if (len(modified) > 0):
        print '  modified: %s' % (modified)        
    if (len(same) != len(d1_keys)):
        print '      same: %s' % (same)        
    
    return ((len(added)    == 0) and 
            (len(removed)  == 0) and 
            (len(modified) == 0) and             
            (len(same)     == len(d2_keys)))

tstAB1Dct = {'a': 1, 'b': 1}; tstAB2Dct = {'a': 1, 'b': 2}
print myisEqualDct(tstAB1Dct, tstAB1Dct) 
print myisEqualDct(tstAB1Dct, tstAB2Dct) 
tstABC1Dct = {'ab': tstAB1Dct, 'c' : 1}; 
tstABC2Dct = {'ab': tstAB2Dct, 'c' : 3}; 
print myisEqualDct(tstABC1Dct, tstABC1Dct) 
print myisEqualDct(tstABC1Dct, tstABC2Dct) 
True
  modified: {'b': False}
      same: set(['a'])
False
True
  modified: {'b': False}
      same: set(['a'])
  modified: {'c': False, 'ab': False}
      same: set([])
False
In [34]:
print 'len(smqObsTrnLst): %d' % (len(smqObsTrnLst)) 
print 'len(smrObsTrnLst): %d' % (len(smrObsTrnLst)) 
for clsIx in range(len(glbRspClass)):
#     print 'clsIx: %s' % (clsIx)
#     print "type(smqObsTrnLst[clsIx]['Dbs']):" 
#     print    (str(type(smqObsTrnLst[clsIx]['Dbs']).mro()))    
#     print "type(smqObsTrnLst[clsIx]['Dbs']): %s" \
#         (str(type(smqObsTrnLst[clsIx]['Dbs']).mro()))    
#     print smqObsTrnLst[clsIx]
    assert myisEqualDct(smqObsTrnLst[clsIx], smrObsTrnLst[clsIx]), \
        'diff in class: %s' % glbRspClass[clsIx]        
len(smqObsTrnLst): 10
len(smrObsTrnLst): 10
In [49]:
# print 'numpy.ndarray' in type(smqObsTrnLst[9]['Dbs']['Rsp']).mro()
# print type(smqObsTrnLst[9]['Dbs']['Rsp'])
# print smqObsTrnLst[9]['Dbs']['Rsp'].shape
# print smqObsTrnLst[9]['Dbs']['Rsp']

# print type(smrObsTrnLst[9]['Dbs']['Rsp'])
# print smrObsTrnLst[9]['Dbs']['Rsp'].shape
# print smrObsTrnLst[9]['Dbs']['Rsp']
# print pd.Series(smrObsTrnRsp[9]['Dbs']['Rsp'])
# print pd.Series(smrObsTrnRsp[9]['Dbs']['Rsp']).value_counts()

tstArr = smrObsTrnLst[9]['Dbs']['Rsp']
print pd.Series(tstArr)
0     9
1     9
2     9
3     9
4     9
5     9
6     9
7     9
8     9
9     9
10    9
11    9
12    9
13    9
14    9
15    9
16    9
17    9
18    9
19    9
20    9
21    9
22    9
23    9
24    9
dtype: int32
In [52]:
def mybuildDatabase(lclObsLst):
    # lclObsLst dictionary structure:
    #   {'Cls': idClass, 'Dbs': {'Idn': ids, 'Ftr': dataset, 'Rsp': labels}
    lclObsIdn = []
#     assert isinstance(lclObsIdn, list), 'lclObsIdn is not a list'
#     print 'type(lclObsIdn): %s' % type(lclObsIdn)
    lclObsFtr = lclObsRsp = None   
    for clsIx in range(len(lclObsLst)):
        lclObsIdn.extend(lclObsLst[clsIx]['Dbs']['Idn'])
        lclObsFtr = np.vstack((lclObsFtr, 
                               lclObsLst[clsIx]['Dbs']['Ftr'])) \
            if not (lclObsFtr == None) else lclObsLst[clsIx]['Dbs']['Ftr']
        lclObsRsp = np.hstack((lclObsRsp, 
                               lclObsLst[clsIx]['Dbs']['Rsp'])) \
            if not (lclObsRsp == None) else lclObsLst[clsIx]['Dbs']['Rsp']
#     print lclObsIdn    
    return lclObsIdn, lclObsFtr, lclObsRsp
    
smrObsTrnIdn, smrObsTrnFtr, smrObsTrnRsp = mybuildDatabase(smrObsTrnLst)
print('Identifiers:', len(smrObsTrnIdn))
print('Sample dataset tensor:', smrObsTrnFtr.shape)
print('Mean:', np.mean(smrObsTrnFtr))
print('Standard deviation:', np.std(smrObsTrnFtr))
print('Labels:', smrObsTrnRsp.shape)
# print(smrObsTrnRsp[25:30])
print('Label Knts:'); print(pd.Series(smrObsTrnRsp).value_counts())
('Identifiers:', 250)
('Full dataset tensor:', (250, 64, 64))
('Mean:', -0.10132633)
('Standard deviation:', 0.32568815)
('Labels:', (250,))
Label Knts:
9    25
8    25
7    25
6    25
5    25
4    25
3    25
2    25
1    25
0    25
dtype: int64
/usr/local/lib/python2.7/site-packages/ipykernel/__main__.py:12: FutureWarning: comparison to `None` will result in an elementwise object comparison in the future.
/usr/local/lib/python2.7/site-packages/ipykernel/__main__.py:15: FutureWarning: comparison to `None` will result in an elementwise object comparison in the future.
In [53]:
thsBgnTm = datetime.now()
glbObsTrnLst = Parallel(n_jobs = nCores, verbose = 1)(delayed(
        load)(cls, trnFolders[glbRspClass.index(cls)], 2500, 
                maxCheck = True, verbose = False) for cls in glbRspClass)
print 'len(glbObsTrnLst): %d' % (len(glbObsTrnLst))    
thsDurDff = (datetime.now() - thsBgnTm).seconds  
print 'Trn Parallel load duration: %0.2f seconds' % (thsDurDff) 
len(glbObsTrnLst): 10
Parallel load duration: 378.00 seconds
Class: c0; Folder: /Users/bbalaji-2012/Documents/Work/DataScience/Kaggle/StateFarm/data/imgs/train/c0
Class: c1; Folder: /Users/bbalaji-2012/Documents/Work/DataScience/Kaggle/StateFarm/data/imgs/train/c1
Class: c2; Folder: /Users/bbalaji-2012/Documents/Work/DataScience/Kaggle/StateFarm/data/imgs/train/c2
Class: c3; Folder: /Users/bbalaji-2012/Documents/Work/DataScience/Kaggle/StateFarm/data/imgs/train/c3
Class: c4; Folder: /Users/bbalaji-2012/Documents/Work/DataScience/Kaggle/StateFarm/data/imgs/train/c4
Class: c5; Folder: /Users/bbalaji-2012/Documents/Work/DataScience/Kaggle/StateFarm/data/imgs/train/c5
Class: c6; Folder: /Users/bbalaji-2012/Documents/Work/DataScience/Kaggle/StateFarm/data/imgs/train/c6
Class: c7; Folder: /Users/bbalaji-2012/Documents/Work/DataScience/Kaggle/StateFarm/data/imgs/train/c7
Class: c8; Folder: /Users/bbalaji-2012/Documents/Work/DataScience/Kaggle/StateFarm/data/imgs/train/c8
Class: c9; Folder: /Users/bbalaji-2012/Documents/Work/DataScience/Kaggle/StateFarm/data/imgs/train/c9
  image_index:     0 (    0 secs)  image_index:     0 (    0 secs)  image_index:     0 (    0 secs)  image_index:     0 (    0 secs)  image_index:     0 (    0 secs)  image_index:     0 (    0 secs)  image_index:     0 (    0 secs)  image_index:     0 (    0 secs)  image_index:     0 (    0 secs)  image_index:     0 (    0 secs)









  image_index:     2 (    0 secs)  image_index:     2 (    0 secs)  image_index:     2 (    0 secs)  image_index:     2 (    0 secs)  image_index:     2 (    0 secs)  image_index:     2 (    0 secs)  image_index:     2 (    0 secs)  image_index:     2 (    0 secs)  image_index:     2 (    0 secs)  image_index:     2 (    0 secs)









  image_index:     4 (    1 secs)  image_index:     4 (    1 secs)  image_index:     4 (    1 secs)  image_index:     4 (    1 secs)  image_index:     4 (    1 secs)  image_index:     4 (    1 secs)  image_index:     4 (    1 secs)  image_index:     4 (    1 secs)  image_index:     4 (    1 secs)  image_index:     4 (    1 secs)









  image_index:     6 (    1 secs)  image_index:     6 (    1 secs)  image_index:     6 (    1 secs)  image_index:     6 (    1 secs)  image_index:     6 (    1 secs)  image_index:     6 (    1 secs)  image_index:     6 (    1 secs)  image_index:     6 (    1 secs)  image_index:     6 (    1 secs)  image_index:     6 (    1 secs)









  image_index:     8 (    1 secs)  image_index:     8 (    1 secs)  image_index:     8 (    1 secs)  image_index:     8 (    1 secs)  image_index:     8 (    1 secs)  image_index:     8 (    1 secs)  image_index:     8 (    1 secs)  image_index:     8 (    1 secs)  image_index:     8 (    1 secs)  image_index:     8 (    1 secs)









  image_index:    20 (    3 secs)  image_index:    20 (    3 secs)  image_index:    20 (    3 secs)  image_index:    20 (    3 secs)  image_index:    20 (    3 secs)  image_index:    20 (    3 secs)  image_index:    20 (    3 secs)  image_index:    20 (    3 secs)  image_index:    20 (    3 secs)  image_index:    20 (    3 secs)









  image_index:    40 (    5 secs)  image_index:    40 (    6 secs)  image_index:    40 (    6 secs)  image_index:    40 (    6 secs)  image_index:    40 (    5 secs)  image_index:    40 (    6 secs)  image_index:    40 (    5 secs)  image_index:    40 (    5 secs)  image_index:    40 (    5 secs)  image_index:    40 (    5 secs)









  image_index:    60 (    8 secs)  image_index:    60 (    8 secs)  image_index:    60 (    8 secs)  image_index:    60 (    8 secs)  image_index:    60 (    8 secs)  image_index:    60 (    8 secs)  image_index:    60 (    8 secs)  image_index:    60 (    8 secs)  image_index:    60 (    8 secs)  image_index:    60 (    8 secs)









  image_index:    80 (   10 secs)  image_index:    80 (   11 secs)  image_index:    80 (   11 secs)  image_index:    80 (   11 secs)  image_index:    80 (   11 secs)  image_index:    80 (   11 secs)  image_index:    80 (   11 secs)  image_index:    80 (   11 secs)  image_index:    80 (   11 secs)  image_index:    80 (   10 secs)









  image_index:   200 (   28 secs)  image_index:   200 (   28 secs)  image_index:   200 (   28 secs)  image_index:   200 (   28 secs)  image_index:   200 (   29 secs)  image_index:   200 (   31 secs)  image_index:   200 (   29 secs)  image_index:   200 (   28 secs)  image_index:   200 (   28 secs)  image_index:   200 (   29 secs)









  image_index:   400 (   57 secs)  image_index:   400 (   57 secs)  image_index:   400 (   57 secs)  image_index:   400 (   56 secs)  image_index:   400 (   58 secs)  image_index:   400 (   61 secs)  image_index:   400 (   57 secs)  image_index:   400 (   56 secs)  image_index:   400 (   56 secs)  image_index:   400 (   57 secs)









  image_index:   600 (   84 secs)  image_index:   600 (   84 secs)  image_index:   600 (   85 secs)  image_index:   600 (   84 secs)  image_index:   600 (   86 secs)  image_index:   600 (   91 secs)  image_index:   600 (   85 secs)  image_index:   600 (   84 secs)  image_index:   600 (   85 secs)  image_index:   600 (   85 secs)









  image_index:   800 (  117 secs)  image_index:   800 (  117 secs)  image_index:   800 (  117 secs)  image_index:   800 (  117 secs)  image_index:   800 (  120 secs)  image_index:   800 (  123 secs)
[Parallel(n_jobs=14)]: Done  10 out of  10 | elapsed:  6.3min finished
  image_index:   800 (  119 secs)  image_index:   800 (  117 secs)  image_index:   800 (  117 secs)  image_index:   800 (  118 secs)









  image_index:  2000 (  316 secs)  image_index:  2000 (  314 secs)  image_index:  2000 (  314 secs)  image_index:  2000 (  318 secs)  image_index:  2000 (  322 secs)  image_index:  2000 (  318 secs)  image_index:  2000 (  315 secs)  image_index:  2000 (  312 secs)('  Identifiers:', 1911)  image_index:  2000 (  316 secs)









('  Identifiers:', 2489)('  Identifiers:', 2267)('  Identifiers:', 2317)('  Identifiers:', 2346)('  Identifiers:', 2326)('  Identifiers:', 2312)('  Identifiers:', 2325)('  Identifiers:', 2002)('  Full dataset tensor:', (1911, 64, 64))
('  Identifiers:', 2129)







('  Mean:', -0.10261098)

('  Full dataset tensor:', (2489, 64, 64))
('  Full dataset tensor:', (2267, 64, 64))
('  Full dataset tensor:', (2317, 64, 64))
('  Full dataset tensor:', (2346, 64, 64))
('  Full dataset tensor:', (2326, 64, 64))
('  Full dataset tensor:', (2312, 64, 64))
('  Full dataset tensor:', (2325, 64, 64))
('  Full dataset tensor:', (2002, 64, 64))
('  Standard deviation:', 0.32375273)('  Full dataset tensor:', (2129, 64, 64))
('  Mean:', -0.10268358)
('  Mean:', -0.1035179)
('  Mean:', -0.10422858)
('  Mean:', -0.10468703)
('  Mean:', -0.098030552)
('  Mean:', -0.098016791)
('  Mean:', -0.096125968)
('  Mean:', -0.094121233)

('  Mean:', -0.088676713)
('  Standard deviation:', 0.32731777)('  Standard deviation:', 0.32373857)('  Standard deviation:', 0.32976153)('  Standard deviation:', 0.32440081)('  Standard deviation:', 0.32597044)('  Standard deviation:', 0.32542631)('  Standard deviation:', 0.32982066)('  Standard deviation:', 0.32239816)('  Labels:', (1911,))
('  Standard deviation:', 0.32720071)







  Label Knts:

('  Labels:', (2489,))
('  Labels:', (2267,))
('  Labels:', (2317,))
('  Labels:', (2346,))
('  Labels:', (2326,))
('  Labels:', (2312,))
('  Labels:', (2325,))
('  Labels:', (2002,))
8    1911
dtype: int64('  Labels:', (2129,))
  Label Knts:
  Label Knts:
  Label Knts:
  Label Knts:
  Label Knts:
  Label Knts:
  Label Knts:
  Label Knts:

  Label Knts:
0    2489
dtype: int641    2267
dtype: int642    2317
dtype: int643    2346
dtype: int644    2326
dtype: int645    2312
dtype: int646    2325
dtype: int647    2002
dtype: int649    2129
dtype: int64








In [54]:
glbObsTrnIdn, glbObsTrnFtr, glbObsTrnRsp = mybuildDatabase(glbObsTrnLst)
/usr/local/lib/python2.7/site-packages/ipykernel/__main__.py:12: FutureWarning: comparison to `None` will result in an elementwise object comparison in the future.
/usr/local/lib/python2.7/site-packages/ipykernel/__main__.py:15: FutureWarning: comparison to `None` will result in an elementwise object comparison in the future.
('Identifiers:', 22424)
('Full dataset tensor:', (22424, 64, 64))
('Mean:', -0.099392936)
('Standard deviation:', 0.32611963)
('Labels:', (22424,))
Label Knts:
0    2489
3    2346
4    2326
6    2325
2    2317
5    2312
1    2267
9    2129
7    2002
8    1911
dtype: int64

We expect the data to be balanced across classes. Verify that.

In [6]:
print('Identifiers:', len(glbObsTrnIdn))
print('Full dataset tensor:', glbObsTrnFtr.shape)
print('Mean:', np.mean(glbObsTrnFtr))
print('Standard deviation:', np.std(glbObsTrnFtr))
print('Labels:', glbObsTrnRsp.shape)
print('Label Knts:'); print(pd.Series(glbObsTrnRsp).value_counts())
('Identifiers:', 22424)
('Full dataset tensor:', (22424, 64, 64))
('Mean:', -0.099392936)
('Standard deviation:', 0.32611963)
('Labels:', (22424,))
Label Knts:
0    2489
3    2346
4    2326
6    2325
2    2317
5    2312
1    2267
9    2129
7    2002
8    1911
dtype: int64

Inspect Resized Image Data

Let's verify that the data still looks good. Displaying a sample of the labels and images from the ndarray.

In [30]:
# print type(pd.Series(glbObsTrnRsp).value_counts())
# print pd.Series(glbObsTrnRsp).value_counts().sort_values(ascending = False)
# print pd.Series(glbObsTrnRsp).value_counts().sort_values(ascending = False).iloc[5]
# print pd.Series(glbObsTrnRsp).value_counts().sort_values(ascending = False).index[5]
print type(glbObsTrnRsp)
print glbObsTrnRsp.shape
print glbObsTrnRsp.shape[0]
print glbObsTrnRsp[10000:10005]
print type(range(glbObsTrnRsp.shape[0]))
print range(glbObsTrnRsp.shape[0])[10000:10005]
print (glbObsTrnRsp == 4)[10000:10005]
print type(np.array(range(glbObsTrnRsp.shape[0]))[glbObsTrnRsp == 4])
print np.array(range(glbObsTrnRsp.shape[0]))[glbObsTrnRsp == 4][10:15]
<type 'numpy.ndarray'>
(22424,)
22424
[4 4 4 4 4]
<type 'list'>
[10000, 10001, 10002, 10003, 10004]
[ True  True  True  True  True]
<type 'numpy.ndarray'>
[9429 9430 9431 9432 9433]
In [7]:
# Revised version down below (to display glbObsNew)
def mydisplayImages(obsIdn, obsFtr, obsRsp):
    clsSrs = pd.Series(obsRsp).value_counts().sort_values()
    nRow = clsSrs.shape[0]; nCol = 3
    figs, axes = plt.subplots(nRow, nCol, 
                          figsize=(6 * nCol, 6 * nRow))
    [(ax.set_xticks([]), ax.set_yticks([]), ax.axis('off')) 
         for ax in axes.flatten()]
    for i, cls in enumerate(clsSrs.index):
        obsClsIx = np.array(range(obsRsp.shape[0]))[obsRsp == cls]
#         print 'cls: %s' % cls
#         print 'obsClsIx[:5]: '; print obsClsIx[:5]
        for j, smpIx in enumerate(
                np.random.randint(0, len(obsClsIx), nCol)):
            if glbImg['color']:
                axes[i, j].imshow(obsFtr[obsClsIx[smpIx], :, :])
            else:    
                axes[i, j].imshow(obsFtr[obsClsIx[smpIx], :, :], cmap = 'gray')
            axes[i, j].set_title(glbRspClass[cls] + ':' + 
                                 obsIdn[obsClsIx[smpIx]])    
    
    plt.show()
    
#     imgIxLst = np.random.random_integers(0, obsFtr.shape[0] - 1, 10)
#     for imgIx in imgIxLst:
#         if (obsRsp[imgIx] > -1):
#             print '  imgIx: %d; id: %s; label: %s' % \
#                 (imgIx, obsIdn[imgIx], glbRspClass[obsRsp[imgIx]])
#         else:    
#             print '  imgIx: %d; id: %s; label: None' % (imgIx, obsIdn[imgIx])    
#         plt.figure
#         plt.imshow(obsFtr[imgIx,:,:], cmap = plt.cm.gray)
#         plt.show()
        
# for i, sbt in enumerate(smpSubjects):
#     smpSbtDbImg = driverDf[driverDf.subject == sbt]
#     for j, imgDesc in enumerate(range(smpNImg)):
#         ixIdn = glbObsTrnIdn.index(smpSbtDbImg.iloc[j]['img'])
#         if glbImg['color']:
#             axes[i, j].imshow(glbObsTrnFtr[ixIdn, :, :])
#         else:    
#             axes[i, j].imshow(glbObsTrnFtr[ixIdn, :, :], cmap = 'gray')
#         axes[i, j].set_title(sbt + ':' + glbRspClass[glbObsTrnRsp[ixIdn]] + 
#                              ':' + driverDf.img[j])        

print 'glbObsTrn set:'; 
mydisplayImages(glbObsTrnIdn, glbObsTrnFtr, glbObsTrnRsp)
glbObsTrn set:

Corrections found here

In [12]:
plt.imshow(myreadImage(trnFoldersPth + '/c8/img_26672.jpg'))
# plt.imshow(myreadImage(trnFoldersPth + '/c9/img_60822.jpg'))
# plt.imshow(myreadImage(trnFoldersPth + '/c9/img_89196.jpg'))
Out[12]:
<matplotlib.image.AxesImage at 0x110ec1490>
In [51]:
print driverDf.head()
  subject classname            img
0    p002        c0  img_44733.jpg
1    p002        c0  img_72999.jpg
2    p002        c0  img_25094.jpg
3    p002        c0  img_69092.jpg
4    p002        c0  img_92629.jpg
In [72]:
print driverDf[driverDf.subject == 'p014'].head()
print driverDf[driverDf.img == 'img_44733.jpg'].head()
     subject classname             img
1548    p014        c0   img_72495.jpg
1549    p014        c0   img_62101.jpg
1550    p014        c0   img_34774.jpg
1551    p014        c0  img_100312.jpg
1552    p014        c0   img_12279.jpg
  subject classname            img
0    p002        c0  img_44733.jpg
In [83]:
# smpSubjects selected to match 
#   https://github.com/ottogroup/statefarm/blob/master/statefarm_getting_started.ipynb
smpSubjects = []

# Updated function defined later ???
def lclDisplaySubjectSampleImages(smpSubjects, lclObsIdn, lclObsFtr, lclObsRsp):
    smpNImg = 3
    nRow = len(smpSubjects); nCol = smpNImg    
    figs, axes = plt.subplots(nRow, nCol, 
                              figsize=(6 * nCol, 6 * nRow))
    [(ax.set_xticks([]), ax.set_yticks([]), ax.axis('off')) 
     for ax in axes.flatten()]
    for i, sbt in enumerate(smpSubjects):
        smpSbtDbImg = driverDf[driverDf.subject == sbt]
        # Select samples from different classes for each subject
        smpCls = [glbRspClass[clsIx] 
                  for clsIx in np.random.randint(0, glbRspClassN, smpNImg)]
#         print 'smpCls: '; print smpCls
        for j, cls in enumerate(smpCls):
            smpSbtClsDbImg = smpSbtDbImg[smpSbtDbImg.classname == cls]
#             print 'sbt: %s; cls: %s; smpSbtClsDbImg.shape:' % (sbt, cls)
#             print smpSbtClsDbImg.shape
#             print smpSbtClsDbImg.columns
#             print np.random.randint(0, smpSbtClsDbImg.shape[0], 1)[0]
#             print smpSbtClsDbImg.iloc[0]['img']
            
            ixIdn = lclObsIdn.index(smpSbtClsDbImg.iloc[
                    np.random.randint(0, smpSbtClsDbImg.shape[0], 1)[0]]['img'])
            if glbImg['color']:
                axes[i, j].imshow(lclObsFtr[ixIdn, :, :])
            else:    
                axes[i, j].imshow(lclObsFtr[ixIdn, :, :], cmap = 'gray')
#             axes[i, j].set_title(sbt + ':' + cls + 
#                                  ':' + smpSbtClsDbImg.iloc[0]['img'])
            axes[i, j].set_title(sbt + ':' + 
                                 glbRspClass[lclObsRsp[ixIdn]] + ':' + 
                                 lclObsIdn[ixIdn])
    plt.show()
    
lclDisplaySubjectSampleImages(['p002', 'p012', 'p014', 'p015', 'p016'],
                              glbObsTrnIdn, glbObsTrnFtr, glbObsTrnRsp)   

Corrections found here

In [85]:
plt.imshow(myreadImage(trnFoldersPth + '/c9/img_71334.jpg'))
plt.imshow(myreadImage(trnFoldersPth + '/c7/img_73378.jpg'))
plt.imshow(myreadImage(trnFoldersPth + '/c9/img_79944.jpg'))
plt.imshow(myreadImage(trnFoldersPth + '/c9/img_92682.jpg'))
# plt.imshow(myreadImage(trnFoldersPth + '/c5/img_93438.jpg'))
Out[85]:
<matplotlib.image.AxesImage at 0x11f686c10>
In [49]:
# Modified above to display sample images from different classes
# smpSubjects selected to match 
#   https://github.com/ottogroup/statefarm/blob/master/statefarm_getting_started.ipynb
# smpSubjects = []

# def lclDisplaySubjectSampleImages(smpSubjects):
#     smpNImg = 3
#     nRow = len(smpSubjects); nCol = smpNImg    
#     figs, axes = plt.subplots(nRow, nCol, 
#                               figsize=(6 * nCol, 6 * nRow))
#     [(ax.set_xticks([]), ax.set_yticks([]), ax.axis('off')) for ax in axes.flatten()]
#     for i, sbt in enumerate(smpSubjects):
#         smpSbtDbImg = driverDf[driverDf.subject == sbt]
#         for j, imgDesc in enumerate(range(smpNImg)):
#             ixIdn = glbObsTrnIdn.index(smpSbtDbImg.iloc[j]['img'])
#             if glbImg['color']:
#                 axes[i, j].imshow(glbObsTrnFtr[ixIdn, :, :])
#             else:    
#                 axes[i, j].imshow(glbObsTrnFtr[ixIdn, :, :], cmap = 'gray')
#             axes[i, j].set_title(sbt + ':' + glbRspClass[glbObsTrnRsp[ixIdn]] + 
#                                  ':' + driverDf.img[j])
#     plt.show()
    
# lclDisplaySubjectSampleImages(['p002', 'p012', 'p014', 'p015', 'p016'])    
In [59]:
# dspLabels = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']

# print 'train set:'
# imgIxLst = np.random.random_integers(0, glbObsTrnFtr.shape[0] - 1, 10)
# for imgIx in imgIxLst:
#     print 'imgIx: %d: label: %s' % (imgIx, dspLabels[glbObsTrnRsp[imgIx]])
#     plt.figure
#     plt.imshow(glbObsTrnFtr[imgIx,:,:], cmap = plt.cm.gray)
#     plt.show()

Move test images to different folders to parallelize. Change newObsTrnLst to glbObsNewLst

In [57]:
thsBgnTm = datetime.now()
newObsTrnLst = [load('new', newFolders[0], 80000, 
                           maxCheck = True, verbose = True)]
# smpObsNewDct = load('new', newFolders[0], 25, maxCheck = False, verbose = False)
print 'len(newObsTrnLst): %d' % (len(newObsTrnLst))    
thsDurDff = (datetime.now() - thsBgnTm).seconds  
print 'newObs load duration: %0.2f seconds' % (thsDurDff) 
unknown class: new; defaulting label to -1
Class: new; Folder: /Users/bbalaji-2012/Documents/Work/DataScience/Kaggle/StateFarm/data/imgs/test
  image_index:     0 (    2 secs)
  image_index:     2 (    2 secs)
  image_index:     4 (    3 secs)
  image_index:     6 (    4 secs)
  image_index:     8 (    4 secs)
  image_index:    20 (    5 secs)
  image_index:    40 (    8 secs)
  image_index:    60 (   10 secs)
  image_index:    80 (   12 secs)
  image_index:   200 (   22 secs)
  image_index:   400 (   39 secs)
  image_index:   600 (   55 secs)
  image_index:   800 (   72 secs)
  image_index:  2000 (  170 secs)
  image_index:  4000 (  335 secs)
  image_index:  6000 (  500 secs)
  image_index:  8000 (  663 secs)
  image_index: 20000 ( 1642 secs)
  image_index: 40000 ( 3349 secs)
  image_index: 60000 ( 5194 secs)
('  Identifiers:', 79726)
('  Full dataset tensor:', (79726, 64, 64))
('  Mean:', -0.097465999)
('  Standard deviation:', 0.33075851)
('  Labels:', (79726,))
  Label Knts:
-1    79726
dtype: int64
len(newObsTrnLst): 1
newObs load duration: 6999.00 seconds
In [58]:
glbObsNewLst = newObsTrnLst
In [59]:
glbObsNewIdn, glbObsNewFtr, glbObsNewRsp = mybuildDatabase(glbObsNewLst)
('Identifiers:', 79726)
('New Full dataset tensor:', (79726, 64, 64))
('Mean:', -0.097465999)
('Standard deviation:', 0.33075851)
('Labels:', (79726,))
Label Knts:
-1    79726
dtype: int64
In [13]:
print('Identifiers:', len(glbObsNewIdn))
print('New Full dataset tensor:', glbObsNewFtr.shape)
print('Mean:', np.mean(glbObsNewFtr))
print('Standard deviation:', np.std(glbObsNewFtr))
print('Labels:', glbObsNewRsp.shape)
print('Label Knts:'); print(pd.Series(glbObsNewRsp).value_counts())
('Identifiers:', 79726)
('New Full dataset tensor:', (79726, 64, 64))
('Mean:', -0.097465999)
('Standard deviation:', 0.33075851)
('Labels:', (79726,))
Label Knts:
-1    79726
dtype: int64
In [ ]:
 
In [11]:
# Displays nCol random sample images per class; the single-class case
# (e.g. the unlabeled 'new' set) is now handled by the same loop instead
# of a duplicated branch.
def mydisplayImages(obsIdn, obsFtr, obsRsp, nCol = 3):
    """Show a grid of sample images: one row per class, nCol samples each.

    Args:
      obsIdn: list of image identifiers (filenames), parallel to obsFtr.
      obsFtr: 3-D array of images (nImages, height, width).
      obsRsp: 1-D array of integer class labels; -1 marks unlabeled ('new').
      nCol:   number of random samples to draw per class (default 3,
              backward compatible with the previous hard-coded value).
    """
    clsSrs = pd.Series(obsRsp).value_counts().sort_values()
    nRow = clsSrs.shape[0]
    figs, axes = plt.subplots(nRow, nCol, 
                          figsize=(6 * nCol, 6 * nRow))
    # plt.subplots returns a 1-D axes array when nRow == 1; normalize to
    # 2-D so one loop covers both the multi-class and single-class cases
    # (previously two near-identical branches).
    axes = np.reshape(axes, (nRow, nCol))
    [(ax.set_xticks([]), ax.set_yticks([]), ax.axis('off')) 
         for ax in axes.flatten()]
    for i, cls in enumerate(clsSrs.index):
        # Row indices of the observations belonging to this class
        obsClsIx = np.flatnonzero(obsRsp == cls)
        for j, smpIx in enumerate(
                np.random.randint(0, len(obsClsIx), nCol)):
            ax = axes[i, j]
            if glbImg['color']:
                ax.imshow(obsFtr[obsClsIx[smpIx], :, :])
            else:    
                ax.imshow(obsFtr[obsClsIx[smpIx], :, :], cmap = 'gray')
            # Label -1 == unlabeled / 'new' observation
            clsLbl = glbRspClass[cls] if (cls >= 0) else 'new'
            ax.set_title(clsLbl + ':' + obsIdn[obsClsIx[smpIx]])
    
    plt.show()
    
print 'glbObsNew set:'; 
mydisplayImages(glbObsNewIdn, glbObsNewFtr, glbObsNewRsp)
glbObsNew set:
In [15]:
print 'glbObsNew set:'; 
mydisplayImages(glbObsNewIdn, glbObsNewFtr, glbObsNewRsp)
glbObsNew set:
In [21]:
# smpNImg = 3
# smpObsNewIx = np.random.randint(0, len(glbObsNewIdn), size = smpNImg ** 2)
# smpObsNewIx = smpObsNewIx.reshape((smpNImg, smpNImg))
# # print smpObsNewIx

# nRow = smpNImg; nCol = smpNImg    
# figs, axes = plt.subplots(nRow, nCol, 
#                           figsize=(6 * nCol, 6 * nRow))
# [(ax.set_xticks([]), ax.set_yticks([]), ax.axis('off')) for ax in axes.flatten()]
# for i, smpIx in enumerate(range(smpObsNewIx.shape[0])):
#     for j, smpJx in enumerate(range(smpObsNewIx.shape[1])):
#         if glbImg['color']:
#             axes[i, j].imshow(glbObsNewFtr[smpObsNewIx[i, j], :, :])
#         else:    
#             axes[i, j].imshow(glbObsNewFtr[smpObsNewIx[i, j], :, :]
#                               , cmap = 'gray')
#         axes[i, j].set_title('new:' + glbObsNewIdn[smpObsNewIx[i, j]])
[[63723 19273 48330]
 [72835 13242 22025]
 [10341 69496  2453]]
In [117]:
# print 'New set:'; mydisplayImages(glbObsNewIdn, glbObsNewFtr, glbObsNewRsp)
New set:
  imgIx: 6018; id: img_14973.jpg; label: None
  imgIx: 54909; id: img_71431.jpg; label: None
  imgIx: 17412; id: img_28095.jpg; label: None
  imgIx: 66441; id: img_84692.jpg; label: None
  imgIx: 29698; id: img_42352.jpg; label: None
  imgIx: 21633; id: img_33020.jpg; label: None
  imgIx: 67069; id: img_85415.jpg; label: None
  imgIx: 46206; id: img_61395.jpg; label: None
  imgIx: 55126; id: img_71687.jpg; label: None
  imgIx: 53542; id: img_69827.jpg; label: None
imgIx: 61376: label: None
imgIx: 57757: label: None
imgIx: 47410: label: None
imgIx: 50825: label: None
imgIx: 36676: label: None
imgIx: 30483: label: None
imgIx: 37873: label: None
imgIx: 40371: label: None
imgIx: 28404: label: None
imgIx: 36545: label: None
In [49]:
# def load(data_folders, min_num_images, nImgMax):
#   dataset = np.ndarray(
#     shape=(nImgMax, glbImg['size'], glbImg['size']), dtype=np.float32)
#   labels = np.ndarray(shape=(nImgMax), dtype=np.int32)
#   label_index = 0
#   image_index = 0
#   for folder in data_folders:
#     print(folder)
#     for image in os.listdir(folder):
#       if image_index >= nImgMax:
#         raise Exception('More images than expected: %d >= %d' % (
#           image_index, nImgMax))
#       image_file = os.path.join(folder, image)
#       try:
#         image_data = (ndimage.imread(image_file).astype(float) -
#                       glbImgPixelDepth / 2) / glbImgPixelDepth
#         if image_data.shape != (glbImg['size'], glbImg['size']):
#           raise Exception('Unexpected image shape: %s' % str(image_data.shape))
#         dataset[image_index, :, :] = image_data
#         labels[image_index] = label_index
#         image_index += 1
#       except IOError as e:
#         print('Could not read:', image_file, ':', e, '- it\'s ok, skipping.')
#     label_index += 1
#   num_images = image_index
#   dataset = dataset[0:num_images, :, :]
#   labels = labels[0:num_images]
#   if num_images < min_num_images:
#     raise Exception('Many fewer images than expected: %d < %d' % (
#         num_images, min_num_images))
#   print('Full dataset tensor:', dataset.shape)
#   print('Mean:', np.mean(dataset))
#   print('Standard deviation:', np.std(dataset))
#   print('Labels:', labels.shape)
#   return dataset, labels

# glbObsTrnFtr, glbObsTrnRsp = load(train_folders, 450000, 550000)
# glbObsNewFtr, glbObsNewRsp = load(test_folders, 18000, 20000)
In [110]:
#print type(glbObsTrnRsp); print glbObsTrnRsp.shape; print glbObsTrnRsp[0:10]
# print np.sum(glbObsTrnRsp == 0)
# print np.unique(glbObsTrnRsp)
# print 'train labels freqs: %s' % \
#     ([np.sum(glbObsTrnRsp == thsLabel) for thsLabel in np.unique(glbObsTrnRsp)])

Scrub data

TODO: decide how to handle the case where the number of label corrections exceeds 10 — e.g. re-audit the affected class folders rather than patching labels one by one.

In [23]:
# Refer to glbDataScrub

Export database

Save imported data.

In [60]:
glbPickleFile
Out[60]:
{'data': 'data/img_D_SFDD_ImgSz_64.pickle',
 'models': 'data/img_M_SFDD_ImgSz_64.pickle'}
In [ ]:
try:
  f = open(glbPickleFile['data'], 'wb')
  save = {
    'glbObsTrnIdn': glbObsTrnIdn,
    'glbObsTrnFtr': glbObsTrnFtr,
    'glbObsTrnRsp': glbObsTrnRsp,
#     'glbObsVldFtr': glbObsVldFtr,
#     'glbObsVldRsp': glbObsVldRsp,
    'glbObsNewIdn': glbObsNewIdn,
    'glbObsNewFtr': glbObsNewFtr,
    'glbObsNewRsp': glbObsNewRsp,
    }
  pickle.dump(save, f, pickle.HIGHEST_PROTOCOL)
  f.close()
except Exception as e:
  print('Unable to save data to', glbPickleFile['data'], ':', e)
  raise
    
statinfo = os.stat(glbPickleFile['data'])
print('Compressed Data pickle size:', statinfo.st_size)    
In [8]:
# Reload the previously pickled train/new datasets.
# NOTE(review): pickle.load is only safe on trusted local files; this one
# was produced by the save cell above.
with open('data/img_D_SFDD_ImgSz_64_unshuffled.pickle', 'rb') as f:
# with open(glbPickleFile['data'], 'rb') as f:
  save = pickle.load(f)

  # Training set: identifiers, image tensor, labels
  glbObsTrnIdn = save['glbObsTrnIdn']
  glbObsTrnFtr = save['glbObsTrnFtr']
  glbObsTrnRsp = save['glbObsTrnRsp']
    
  # 'New' (test) set: identifiers, image tensor, labels (all -1)
  glbObsNewIdn = save['glbObsNewIdn']
  glbObsNewFtr = save['glbObsNewFtr']
  glbObsNewRsp = save['glbObsNewRsp']

  del save  # hint to help gc free up memory

  # Sanity check: lengths / shapes should agree within each set
  print('Trn set:', len(glbObsTrnIdn), glbObsTrnFtr.shape, 
                    glbObsTrnRsp.shape)
  print('New set:', len(glbObsNewIdn), glbObsNewFtr.shape, 
                    glbObsNewRsp.shape)
('Trn set:', 22424, (22424, 64, 64), (22424,))
('New set:', 79726, (79726, 64, 64), (79726,))

Shuffle data

Next, we'll randomize the data. It's important to have the labels well shuffled for the training and test distributions to match.

In [9]:
np.random.seed(glbObsShuffleSeed)
def randomize(ids, dataset, labels):
  """Shuffle ids, dataset rows, and labels with one shared permutation.

  Args:
    ids: list of identifiers, parallel to dataset/labels.
    dataset: 3-D array (nImages, height, width).
    labels: 1-D array of class labels.
  Returns:
    (shuffled ids list, shuffled dataset, shuffled labels), all reordered
    by the same random permutation so rows stay aligned.
  """
  perm = np.random.permutation(labels.shape[0])
  return ([ids[i] for i in perm],
          dataset[perm, :, :],
          labels[perm])

glbObsTrnIdn, glbObsTrnFtr, glbObsTrnRsp = \
    randomize(glbObsTrnIdn, glbObsTrnFtr, glbObsTrnRsp)
# glbObsNewIdn, glbObsNewFtr, glbObsNewRsp = \
#     randomize(glbObsNewIdn, glbObsNewFtr, glbObsNewRsp)
In [60]:
# np.random.seed(133)
# def randomize(dataset, labels):
#   permutation = np.random.permutation(labels.shape[0])
#   shuffled_dataset = dataset[permutation,:,:]
#   shuffled_labels = labels[permutation]
#   return shuffled_dataset, shuffled_labels
# glbObsTrnFtr, glbObsTrnRsp = randomize(glbObsTrnFtr, glbObsTrnRsp)
# glbObsNewFtr, glbObsNewRsp = randomize(glbObsNewFtr, glbObsNewRsp)

Check if data is still good after shuffling!

In [12]:
print 'shuffled Trn set:'; 
mydisplayImages(glbObsTrnIdn, glbObsTrnFtr, glbObsTrnRsp)
# print 'shuffled New set:'; 
# mydisplayImages(glbObsNewIdn, glbObsNewFtr, glbObsNewRsp)
shuffled Trn set:

Label corrections identified during the visual inspection above are applied here.

In [17]:
# plt.imshow(myreadImage(trnFoldersPth + '/c8/img_25438.jpg'))

# plt.imshow(myreadImage(trnFoldersPth + '/c9/img_382.jpg')) # Debatable
Out[17]:
<matplotlib.image.AxesImage at 0x10b05a8d0>

Prune the training data as needed. Depending on your computer setup, you might not be able to fit it all in memory, and you can tune obsTrnN as needed.

Also create a validation dataset for hyperparameter tuning.

This Project

Partition using drivers as a 'group'. Check which subject's images have higher correlation with the test set images.

In [8]:
def mygetCorrObs(xArr, yArr, 
                 xRowsN = None, yRowsN = None, chunkSize = None):
    thsBgnTm = datetime.now()    
    
    if xRowsN == None: xRowsN = int(xArr.shape[0])
    if yRowsN == None: yRowsN = int(yArr.shape[0])
    if chunkSize == None: chunkSize = int(min(xRowsN, yRowsN))
    else: assert (chunkSize <= xRowsN) or (chunkSize <= yRowsN), \
        'invalid chunkSize: %d, should be None' + \
        ' or <= %5d (xRowsN) or <= %5d (yRowsN)' % \
            (chunkSize, xRowsN, yRowsN)
#     assert chunkSize < 10000, \
#         'chunkSize: %d too large; terminating ...' % (chunkSize)   
        
    xMtx = np.reshape(xArr[:xRowsN], 
                    (xRowsN, xArr.shape[1] * xArr.shape[2]))
    yMtx = np.reshape(yArr[:yRowsN], 
                    (yRowsN, yArr.shape[1] * yArr.shape[2]))
            
    corMtx = np.zeros((xRowsN, yRowsN))
    corMtx[:,:] = np.nan 
    
    for rowIx in xrange(0, int(xRowsN), int(chunkSize)):
        if (rowIx + chunkSize > xRowsN): break
        if ((datetime.now() - thsBgnTm).seconds > 60) and \
            mydspVerboseTrigger(rowIx):
                print '  (at %5d secs) chunkSize: %5d; rowIx: %5d' % \
                    ((datetime.now() - thsBgnTm).seconds, chunkSize, rowIx)
        for colIx in xrange(0, int(yRowsN), int(chunkSize)):
            if (colIx + chunkSize > yRowsN): break            
            corMtx[(rowIx):(rowIx + chunkSize), 
                       (colIx):(colIx + chunkSize)] = \
                    np.corrcoef(xMtx[(rowIx):(rowIx + chunkSize), :], 
                        yMtx[(colIx):(colIx + chunkSize), :])[:chunkSize, 
                                                              chunkSize:]
                    
    if (chunkSize == 1): pass
    else: 
        # x Boundary condition        
        dffXIx = xRowsN % chunkSize
        dffYIx = yRowsN % chunkSize
        if (dffXIx > 0):
            if ((datetime.now() - thsBgnTm).seconds > 60):
                    print '  (at %5d secs) chunkSize: %5d; dffXIx: %5d' % \
                    ((datetime.now() - thsBgnTm).seconds, chunkSize, dffXIx)                        
            assert dffXIx < 10000, \
                'dffXIx: %d too large; terminating ...' % (dffXIx)                       
            for colIx in xrange(0, int(yRowsN), int(chunkSize)):
                if (colIx + chunkSize > yRowsN): break
#                 print '  dffXIx: xArr Rows: (%5d:%5d); yArr Rows: (%5d:%5d)' % \
#                     ((xRowsN - dffXIx), (xRowsN), (colIx), (colIx + chunkSize))
                corMtx[(xRowsN - dffXIx):(xRowsN), 
                       (colIx):(colIx + chunkSize)] = \
                        np.corrcoef(xMtx[(xRowsN - dffXIx):(xRowsN), :], 
                                    yMtx[(colIx):(colIx + chunkSize), :])[
                            :dffXIx, dffXIx:]
                                        
        # y Boundary condition
        if (dffYIx > 0):
#             assert True, 'mygetCorrObs: not implemented yet for dffYIx > 0'
            if ((datetime.now() - thsBgnTm).seconds > 60):
                    print '  (at %5d secs) chunkSize: %5d; dffYIx: %5d' % \
                    ((datetime.now() - thsBgnTm).seconds, chunkSize, dffYIx)                        
            assert dffYIx < 10000, \
                'dffYIx: %d too large; terminating ...' % (dffYIx)                       
            for rowIx in xrange(0, int(xRowsN), int(chunkSize)):
                if (rowIx + chunkSize > xRowsN): break
#                 print '  dffYIx: xArr Rows: (%5d:%5d); yArr Rows: (%5d:%5d)' % \
#                     ((rowIx), (rowIx + chunkSize), (yRowsN - dffYIx), (yRowsN))
                corMtx[(rowIx):(rowIx + chunkSize),
                       (yRowsN - dffYIx):(yRowsN)] = \
                        np.corrcoef(xMtx[(rowIx):(rowIx + chunkSize), :],
                                    yMtx[(yRowsN - dffYIx):(yRowsN), :])[
                            :chunkSize, chunkSize:]
                        
        # x & y Boundary condition        
        if (dffXIx > 0) or (dffYIx > 0):
            if ((datetime.now() - thsBgnTm).seconds > 60):
                print '  (at %5d secs) chunkSize: 1; xRowsN - dffXIx: %5d; yRowsN - dffYIx: %5d' % \
        ((datetime.now() - thsBgnTm).seconds, xRowsN - dffXIx, yRowsN - dffYIx)                        

#             assert dffXIx * dffYIx < 10000, \
#         'dffXIx*YIx: %d too large; dffXIx: %d; dffYIx: %d; terminating...' % \
#                 (dffXIx * dffYIx, dffXIx, dffYIx)
            for rowIx in xrange(int(xRowsN - dffXIx), int(xRowsN)):
                thsDrn = (datetime.now() - thsBgnTm).seconds
                if (thsDrn > 60) and \
                    mydspVerboseTrigger(rowIx):
                        print '  (at %d secs) chunkSize: 1; rowIx: %5d' % (thsDrn, rowIx)                
                for colIx in xrange(int(yRowsN - dffYIx), int(yRowsN)):
                    corMtx[rowIx:(rowIx + 1), colIx:(colIx + 1)] = \
                        np.corrcoef(xMtx[rowIx:(rowIx + 1), :], 
                                    yMtx[colIx:(colIx + 1), :])[:1, 1:]        
                                        
    assert (corMtx[:,:] != np.nan).all(), 'some cells in corMtx == nan'
            
    return pd.DataFrame({'mean'  : np.nanmean(corMtx),
                         'median': np.nanmedian(corMtx),
                         'min'   : np.nanmin(corMtx),
                         'max'   : np.nanmax(corMtx),
                         'xRowsN'   : xRowsN,
                         'yRowsN'   : yRowsN, 
                        'duration': (datetime.now() - thsBgnTm).seconds,
                      'chunkSize': chunkSize
                        }, index = [0])
    
# print mygetCorrObs(glbObsTrnFtr[:11], glbObsNewFtr[:7], chunkSize = 1)
# print mygetCorrObs(glbObsTrnFtr[:11], glbObsNewFtr[:7], chunkSize = 3)
# print mygetCorrObs(glbObsTrnFtr[:11], glbObsNewFtr[:7], chunkSize = 9)
# print mygetCorrObs(glbObsTrnFtr[:11], glbObsNewFtr[:7])

print mygetCorrObs(glbObsFitFtr[:11], glbObsNewFtr[:7], chunkSize = 1)
print mygetCorrObs(glbObsFitFtr[:11], glbObsNewFtr[:7], chunkSize = 3)
print mygetCorrObs(glbObsFitFtr[:11], glbObsNewFtr[:7], chunkSize = 9)
print mygetCorrObs(glbObsFitFtr[:11], glbObsNewFtr[:7])

# NaN correlation
# print mygetCorrObs(glbObsTrnFtr[:5], glbObsNewFtr[21164:(21164+10)])
   chunkSize  duration       max      mean    median       min  xRowsN  yRowsN
0          1         0  0.793803  0.526378  0.556235  0.104124      11       7
   chunkSize  duration       max      mean    median       min  xRowsN  yRowsN
0          3         0  0.793803  0.526378  0.556235  0.104124      11       7
   chunkSize  duration       max      mean    median       min  xRowsN  yRowsN
0          9         0  0.793803  0.526378  0.556235  0.104124      11       7
   chunkSize  duration       max      mean    median       min  xRowsN  yRowsN
0          7         0  0.793803  0.526378  0.556235  0.104124      11       7
In [144]:
# print mygetCorrObs(glbObsTrnFtr[:5], glbObsNewFtr[10000:50000])
# print mygetCorrObs(glbObsTrnFtr[:5], glbObsNewFtr[10000:
#                                                   (10000+50000) / 4])
# print mygetCorrObs(glbObsTrnFtr[:5], glbObsNewFtr[
#                                 (10000+50000) / 4:
#                                 (10000+50000) / 2])
# print mygetCorrObs(glbObsTrnFtr[:5], glbObsNewFtr[
#                                 (15000+    0) / 1:
#                                 (15000+30000) / 2])
# print mygetCorrObs(glbObsTrnFtr[:5], glbObsNewFtr[
#                                 (20625+    0) / 1:
#                                 (20625+22500) / 2])
# print mygetCorrObs(glbObsTrnFtr[:5], glbObsNewFtr[
#                                 (21093+    0) / 1:
#                                 (21093+21210) / 2])
print mygetCorrObs(glbObsTrnFtr[:5], glbObsNewFtr[
                                (21164+    0) / 1:
                                (    0+21165) / 1])
   chunkSize  duration  max  mean  median  min  xRowsN  yRowsN
0          1         0  NaN   NaN     NaN  NaN       5       1
In [148]:
print (glbObsNewFtr[21164] == -0.5).all()
True
In [12]:
tstGetCorrObsDf = None
# tstGetCorrObsDf = pd.DataFrame()
# tstGetCorrObsDf = tstGetCorrObsDf[tstGetCorrObsDf['mean'].notnull()]
In [13]:
%run img_utils.py
In [36]:
srchParamsDct = {
                'chunkSize'   : [300, 500, 700],
#                 'chunkSize'   : [1, 5, 10, 100, 200, 1000],
    
                'xRowsN'   : [346, 791, 1237],
#                 'xRowsN'   : [10, 100, 200, 346],
                            # 346 is min; 1237 is max by subject
    
                'yRowsN' : [79726],        
#                 'yRowsN' : [100, 1000, 10000, 20000, 50000, 79726],
                            # 79726 is len(glbObsNewIdn)
                }
jnk = mysearchParams(mygetCorrObs, srchParamsDct = srchParamsDct, 
               curResultsDf = tstGetCorrObsDf, 
               mode = 'displayonly', 
                save_filepathname = \
        'data/img_01_import_data_SFDD_ImgSz_64_tstGetCorrObsDf.pickle',                     
              xArr = glbObsTrnFtr, 
              yArr = glbObsNewFtr)
Running <function mygetCorrObs at 0x114861b18> with params:
                         chunkSize   yRowsN  xRowsN
chunkSize yRowsN xRowsN                            
5.0       10.0   5.0         300.0  79726.0   346.0
                 5.0         300.0  79726.0   791.0
                 5.0         300.0  79726.0  1237.0
                 5.0         500.0  79726.0   791.0
                 5.0         700.0  79726.0   346.0
                 5.0         700.0  79726.0   791.0
                 5.0         700.0  79726.0  1237.0
Compressed pickle file: data/img_01_import_data_SFDD_ImgSz_64_tstGetCorrObsDf.pickle; size: 4 KB
In [37]:
tstGetCorrObsDf = mysearchParams(mygetCorrObs, 
                    srchParamsDct = srchParamsDct, 
                    curResultsDf = tstGetCorrObsDf, 
                    mode = 'run', 
                    sort_values =    ['yRowsN', 'xRowsN', 'duration'], 
                    sort_ascending = [False   , False   , True      ],
                    save_filepathname = \
        'data/img_01_import_data_SFDD_ImgSz_64_tstGetCorrObsDf.pickle',
                    xArr = glbObsTrnFtr, 
                    yArr = glbObsNewFtr)
# print tstGetCorrObsDf
/usr/local/lib/python2.7/site-packages/ipykernel/__main__.py:12: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
/usr/local/lib/python2.7/site-packages/ipykernel/__main__.py:14: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
/usr/local/lib/python2.7/site-packages/ipykernel/__main__.py:17: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
/usr/local/lib/python2.7/site-packages/ipykernel/__main__.py:28: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
/usr/local/lib/python2.7/site-packages/ipykernel/__main__.py:29: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
/usr/local/lib/python2.7/site-packages/ipykernel/__main__.py:30: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
/usr/local/lib/python2.7/site-packages/ipykernel/__main__.py:47: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
/usr/local/lib/python2.7/site-packages/ipykernel/__main__.py:48: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
/usr/local/lib/python2.7/site-packages/ipykernel/__main__.py:49: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
/usr/local/lib/python2.7/site-packages/ipykernel/__main__.py:63: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
/usr/local/lib/python2.7/site-packages/ipykernel/__main__.py:64: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
/usr/local/lib/python2.7/site-packages/ipykernel/__main__.py:65: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  chunkSize:   300; dffXIx:    37
  chunkSize:   300; dffYIx:   226
  chunkSize: 1; xRowsN - dffXIx:  1200; yRowsN - dffYIx: 79500
  (at 72 secs) chunkSize: 1; rowIx:   800
                         bestFit       min       max    median  duration  \
chunkSize yRowsN  xRowsN                                                   
500.0     79726.0 1237.0   False -0.187040  0.946158  0.536080        74   
300.0     79726.0 1237.0     NaN -0.187040  0.946158  0.536080        90   
1000.0    79726.0 1237.0   False -0.187040  0.946158  0.536080        91   
700.0     79726.0 1237.0     NaN -0.187040  0.946158  0.536080       134   
100.0     79726.0 1237.0   False -0.187040  0.946158  0.536080       190   
500.0     79726.0 791.0      NaN -0.187040  0.946158  0.533224        59   
700.0     79726.0 791.0      NaN -0.187040  0.946158  0.533224        60   
300.0     79726.0 791.0      NaN -0.187040  0.946158  0.533224        66   
500.0     79726.0 346.0     True -0.187040  0.946158  0.532195        33   
300.0     79726.0 346.0      NaN -0.187040  0.946158  0.532195        35   
100.0     79726.0 346.0    False -0.187040  0.946158  0.532195        54   
700.0     79726.0 346.0      NaN -0.187040  0.946158  0.532195        66   
1000.0    79726.0 346.0    False -0.187040  0.946158  0.532195        74   
2000.0    79726.0 346.0    False -0.187040  0.946158  0.532195       145   
1000.0    50000.0 200.0    False -0.187040  0.945474  0.534401        11   
100.0     50000.0 200.0    False -0.187040  0.945474  0.534401        18   
5.0       50000.0 5.0      False -0.023532  0.854622  0.532842         5   
1.0       50000.0 5.0      False -0.023532  0.854622  0.532842        49   
2000.0    20000.0 200.0    False -0.186376  0.945474  0.534865         6   
1000.0    20000.0 200.0    False -0.186376  0.945474  0.534865        41   
100.0     20000.0 200.0    False -0.186376  0.945474  0.534865        48   
5.0       20000.0 5.0      False -0.023532  0.854622  0.532970         2   
1.0       20000.0 5.0      False -0.023532  0.854622  0.532970        19   
100.0     10000.0 100.0    False -0.186376  0.945474  0.537716        13   
10.0      10000.0 100.0    False -0.186376  0.945474  0.537716        19   
5.0       10000.0 100.0    False -0.186376  0.945474  0.537716        31   
1.0       10000.0 100.0    False -0.186376  0.945474  0.537716       236   
10.0      10000.0 10.0     False -0.023532  0.864736  0.542571        11   
5.0       10000.0 10.0     False -0.023532  0.864736  0.542571        12   
1.0       10000.0 10.0     False -0.023532  0.864736  0.542571        24   
5.0       10000.0 5.0      False -0.023532  0.854622  0.533349         1   
1.0       10000.0 5.0      False -0.023532  0.854622  0.533349        10   
10.0      1000.0  100.0    False -0.169296  0.934644  0.536288         1   
5.0       1000.0  100.0    False -0.169296  0.934644  0.536288         2   
1.0       1000.0  100.0    False -0.169296  0.934644  0.536288        24   
5.0       1000.0  10.0     False -0.023532  0.862511  0.542305         0   
10.0      1000.0  10.0     False -0.023532  0.862511  0.542305         0   
1.0       1000.0  10.0     False -0.023532  0.862511  0.542305         2   
5.0       1000.0  5.0      False -0.023532  0.836999  0.533127         0   
10.0      1000.0  5.0      False -0.023532  0.836999  0.533127         0   
1.0       1000.0  5.0      False -0.023532  0.836999  0.533127         1   
          100.0   10.0     False  0.070164  0.858833  0.549639         0   
5.0       100.0   10.0     False  0.070164  0.858833  0.549639         0   
10.0      100.0   10.0     False  0.070164  0.858833  0.549639         0   
1.0       100.0   5.0      False  0.070164  0.794519  0.542296         0   
5.0       100.0   5.0      False  0.070164  0.794519  0.542296         0   
10.0      100.0   5.0      False  0.070164  0.794519  0.542296         0   
1.0       10.0    5.0      False  0.104124  0.720964  0.540077         0   
5.0       10.0    5.0      False  0.104124  0.720964  0.540077         0   

                              mean  
chunkSize yRowsN  xRowsN            
500.0     79726.0 1237.0  0.532151  
300.0     79726.0 1237.0  0.532151  
1000.0    79726.0 1237.0  0.532151  
700.0     79726.0 1237.0  0.532151  
100.0     79726.0 1237.0  0.532151  
500.0     79726.0 791.0   0.529832  
700.0     79726.0 791.0   0.529832  
300.0     79726.0 791.0   0.529832  
500.0     79726.0 346.0   0.529152  
300.0     79726.0 346.0   0.529152  
100.0     79726.0 346.0   0.529152  
700.0     79726.0 346.0   0.529152  
1000.0    79726.0 346.0   0.529152  
2000.0    79726.0 346.0   0.529152  
1000.0    50000.0 200.0   0.529535  
100.0     50000.0 200.0   0.529535  
5.0       50000.0 5.0     0.516832  
1.0       50000.0 5.0     0.516832  
2000.0    20000.0 200.0   0.529921  
1000.0    20000.0 200.0   0.529921  
100.0     20000.0 200.0   0.529921  
5.0       20000.0 5.0     0.517077  
1.0       20000.0 5.0     0.517077  
100.0     10000.0 100.0   0.533284  
10.0      10000.0 100.0   0.533284  
5.0       10000.0 100.0   0.533284  
1.0       10000.0 100.0   0.533284  
10.0      10000.0 10.0    0.536553  
5.0       10000.0 10.0    0.536553  
1.0       10000.0 10.0    0.536553  
5.0       10000.0 5.0     0.517284  
1.0       10000.0 5.0     0.517284  
10.0      1000.0  100.0   0.531372  
5.0       1000.0  100.0   0.531372  
1.0       1000.0  100.0   0.531372  
5.0       1000.0  10.0    0.534948  
10.0      1000.0  10.0    0.534948  
1.0       1000.0  10.0    0.534948  
5.0       1000.0  5.0     0.514484  
10.0      1000.0  5.0     0.514484  
1.0       1000.0  5.0     0.514484  
          100.0   10.0    0.541505  
5.0       100.0   10.0    0.541505  
10.0      100.0   10.0    0.541505  
1.0       100.0   5.0     0.523201  
5.0       100.0   5.0     0.523201  
10.0      100.0   5.0     0.523201  
1.0       10.0    5.0     0.510330  
5.0       10.0    5.0     0.510330  
Compressed pickle file: data/img_01_import_data_SFDD_ImgSz_64_tstGetCorrObsDf.pickle; size: 4 KB
In [38]:
tstGetCorrObsDf['bestFit'] = False
tstGetCorrObsDf.ix[(500.0, 79726.0, 346.0), 'bestFit'] = True
# 1000.0    79726.0 346.0
print tstGetCorrObsDf[tstGetCorrObsDf.yRowsN >= 70000]
                         bestFit  chunkSize  duration       max      mean  \
chunkSize yRowsN  xRowsN                                                    
500.0     79726.0 1237.0   False      500.0        74  0.946158  0.532151   
300.0     79726.0 1237.0   False      300.0        90  0.946158  0.532151   
1000.0    79726.0 1237.0   False     1000.0        91  0.946158  0.532151   
700.0     79726.0 1237.0   False      700.0       134  0.946158  0.532151   
100.0     79726.0 1237.0   False      100.0       190  0.946158  0.532151   
500.0     79726.0 791.0    False      500.0        59  0.946158  0.529832   
700.0     79726.0 791.0    False      700.0        60  0.946158  0.529832   
300.0     79726.0 791.0    False      300.0        66  0.946158  0.529832   
500.0     79726.0 346.0     True      500.0        33  0.946158  0.529152   
300.0     79726.0 346.0    False      300.0        35  0.946158  0.529152   
100.0     79726.0 346.0    False      100.0        54  0.946158  0.529152   
700.0     79726.0 346.0    False      700.0        66  0.946158  0.529152   
1000.0    79726.0 346.0    False     1000.0        74  0.946158  0.529152   
2000.0    79726.0 346.0    False     2000.0       145  0.946158  0.529152   

                            median      min  xRowsN   yRowsN  
chunkSize yRowsN  xRowsN                                      
500.0     79726.0 1237.0  0.536080 -0.18704  1237.0  79726.0  
300.0     79726.0 1237.0  0.536080 -0.18704  1237.0  79726.0  
1000.0    79726.0 1237.0  0.536080 -0.18704  1237.0  79726.0  
700.0     79726.0 1237.0  0.536080 -0.18704  1237.0  79726.0  
100.0     79726.0 1237.0  0.536080 -0.18704  1237.0  79726.0  
500.0     79726.0 791.0   0.533224 -0.18704   791.0  79726.0  
700.0     79726.0 791.0   0.533224 -0.18704   791.0  79726.0  
300.0     79726.0 791.0   0.533224 -0.18704   791.0  79726.0  
500.0     79726.0 346.0   0.532195 -0.18704   346.0  79726.0  
300.0     79726.0 346.0   0.532195 -0.18704   346.0  79726.0  
100.0     79726.0 346.0   0.532195 -0.18704   346.0  79726.0  
700.0     79726.0 346.0   0.532195 -0.18704   346.0  79726.0  
1000.0    79726.0 346.0   0.532195 -0.18704   346.0  79726.0  
2000.0    79726.0 346.0   0.532195 -0.18704   346.0  79726.0  
In [39]:
# Render model-stat comparison plots via R/ggplot2 (rpy2 bridge).
# pandas2ri.activate() enables automatic pandas <-> R data.frame conversion.
robjects.pandas2ri.activate()
pltRDf = robjects.conversion.py2ri(tstGetCorrObsDf[tstGetCorrObsDf.xRowsN >= 346])
# print(pltRDf)
# First plot: mean / median / duration faceted by chunkSize, yRowsN, xRowsN
pltRFn = robjects.r("""
    source('~/Dropbox/datascience/R/myplot.R')
    function(RDf, filename) {
        mypltModelStats(RDf, c('mean', 'median', 'duration'), 
            dim = c('chunkSize', 'yRowsN','xRowsN'), 
                scaleXFn = NULL, 
                #highLightIx = which.min(RDf$logLossVld),
                highLightIx = which(RDf$bestFit == 'TRUE'),                
            title = NULL, 
            fileName = filename)
    }                        
""")    
pltRFn(pltRDf, 'img_01_import_data_SFDD_tstGetCorrObsDf.png')

# Second plot: same, without the duration panel
pltRFn = robjects.r("""
    source('~/Dropbox/datascience/R/myplot.R')
    function(RDf, filename) {
        mypltModelStats(RDf, c('mean', 'median'), 
            dim = c('chunkSize', 'yRowsN','xRowsN'), 
                scaleXFn = NULL, 
                #highLightIx = which.min(RDf$logLossVld),
                highLightIx = which(RDf$bestFit == 'TRUE'),                
            title = NULL, 
            fileName = filename)
    }                        
""")    
pltRFn(pltRDf, 'img_01_import_data_SFDD_tstGetCorrObsDf_nodur.png')
Out[39]:
<ListVector - Python:0x114a13d40 / R:0x7fbf7df378b0>
[DataF..., ListV..., Envir..., ..., ListV..., Envir..., ListV...]
<ListVector - Python:0x114a13d40 / R:0x7fbf7df378b0>
[DataF..., ListV..., Envir..., ..., ListV..., Envir..., ListV...]
<ListVector - Python:0x114a13d40 / R:0x7fbf7df378b0>
[DataF..., ListV..., Envir..., ..., ListV..., Envir..., ListV...]
  scales: <class 'rpy2.robjects.environments.Environment'>
  <Environment - Python:0x115fd2c20 / R:0x7fbf7ddbec78>
  ...
<ListVector - Python:0x114a13d40 / R:0x7fbf7df378b0>
[DataF..., ListV..., Envir..., ..., ListV..., Envir..., ListV...]
  layers: <class 'rpy2.robjects.environments.Environment'>
  <Environment - Python:0x1147115a8 / R:0x7fbf7dac4958>
<ListVector - Python:0x114a13d40 / R:0x7fbf7df378b0>
[DataF..., ListV..., Envir..., ..., ListV..., Envir..., ListV...]
In [32]:
tstGetCorrObsDf.to_csv('data/img_01_import_data_SFDD_ImgSz_64_tstGetCorrObsDf.csv')
In [170]:
# def mygetCorrObs(xArr, yArr):
#     xRowsN = xArr.shape[0] 
#     yRowsN = yArr.shape[0]     
#     corMtx = np.corrcoef(np.reshape(xArr, 
#                     (xArr.shape[0], xArr.shape[1] * xArr.shape[2])),
#                          np.reshape(yArr, 
#                     (yArr.shape[0], yArr.shape[1] * yArr.shape[2])))
# #     print corMtx.shape
#     corMtx = corMtx[:xRowsN, xRowsN:]
# #     print corMtx
# #     print 'corMtx: min: %.4f; max: %.4f; avg: %.4f;' % \
# #         (np.min(corMtx), np.max(corMtx), np.mean(corMtx))
            
#     return pd.DataFrame({'mean'  : np.mean(corMtx),
#                          'median': np.median(corMtx),
#                          'min'   : np.min(corMtx),
#                          'max'   : np.max(corMtx),
#                          'x.n'   : np.shape(xArr)[0],
#                          'y.n'   : np.shape(yArr)[0],       
#                         }, index = [0])
    
# print mygetCorrObs(glbObsTrnFtr[:5], glbObsNewFtr[:10])
In [12]:
# drvTrnSbt = driverDf.subject.unique()
# Training-image count per driver (subject), ascending — used below to
# pick subjects for the correlation experiments (346 is the minimum,
# 1237 the maximum).
drvSbtDf = (pd.DataFrame({'Trn.Images.N': driverDf['subject'].value_counts()})
            .sort_values('Trn.Images.N'))
print drvSbtDf
      Trn.Images.N
p072           346
p042           591
p041           605
p039           651
p045           724
p002           725
p052           740
p050           790
p056           794
p061           809
p075           814
p064           820
p012           823
p081           823
p047           835
p035           848
p015           875
p014           876
p051           920
p049          1011
p066          1034
p016          1078
p026          1196
p024          1226
p022          1233
p021          1237
In [12]:
# srhSqnNewCorDf = smpSqnNewCorDf
# srhSqnNewCorDf = pd.DataFrame()
In [13]:
def getSbtNewObsCorrelation(sbt, lclObsTrnIdn, lclObsTrnFtr, 
                            verbose = False):

    thsBgnTm = datetime.now()    

    if verbose:
        print 'getSbtNewObsCorrelation: sbt: %s' % (sbt)    
        
    tmpSet = set(driverDf[driverDf.subject.isin([sbt])]['img'])
    sbtObsTrnIdx = [ix for ix in xrange(len(lclObsTrnIdn)) 
                        if lclObsTrnIdn[ix] in 
                        tmpSet]
    sbtObsTrnFtr = lclObsTrnFtr[sbtObsTrnIdx,:,:]
#     if verbose:
#         print '  sbtObsTrnFtr duration: %0.2f seconds' % \
#             ((datetime.now() - thsBgnTm).seconds)    
        
    thsBgnTm = datetime.now()        
    corDf = mygetCorrObs(sbtObsTrnFtr, glbObsNewFtr, chunkSize = 500)
    
    corDf['subject'] = sbt
    if verbose:
        print '  corDf duration: %0.2f seconds' % \
            ((datetime.now() - thsBgnTm).seconds)    
    
    return corDf
    
thsBgnTm = datetime.now() 
retLst = []
for sbt in drvSbtDf.index[:3]:    
    retLst.append(getSbtNewObsCorrelation(sbt, 
                                glbObsTrnIdn, glbObsTrnFtr,
                                         verbose = True))
    
smpSqnNewCorDf = pd.DataFrame()
for df in retLst:
    smpSqnNewCorDf = smpSqnNewCorDf.append(df)
    
print 'smp getSbtNewObsCorrelation sequential duration: %0.2f seconds' % \
    ((datetime.now() - thsBgnTm).seconds)     
print smpSqnNewCorDf    
getSbtNewObsCorrelation: sbt: p072
  corDf duration: 36.00 seconds
getSbtNewObsCorrelation: sbt: p042
  corDf duration: 42.00 seconds
getSbtNewObsCorrelation: sbt: p041
  corDf duration: 44.00 seconds
smp getSbtNewObsCorrelation sequential duration: 122.00 seconds
   chunkSize  duration       max      mean    median       min  xRowsN  \
0        500        36  0.896713  0.573754  0.580933 -0.019778     346   
0        500        42  0.866144  0.483161  0.503277 -0.111311     591   
0        500        44  0.846570  0.519290  0.510401  0.023227     605   

   yRowsN subject  
0   79726    p072  
0   79726    p042  
0   79726    p041  

The parallel test does not work; it crashes every time it is run, so the sequential results above are used instead.

In [ ]:
# NOTE(review): per the markdown note above, this joblib.Parallel version
# crashes (likely the large glbObsTrnFtr / glbObsNewFtr arrays being
# serialized to each worker -- TODO confirm); the sequential run above is
# the one whose results are used.
thsBgnTm = datetime.now()    
retLst = Parallel(n_jobs = nCores, verbose = 2)(delayed(
        getSbtNewObsCorrelation)(sbt, glbObsTrnIdn, glbObsTrnFtr) 
                                                for sbt in drvSbtDf.index[:3])

# print retLst    
smpPrlNewCorDf = pd.DataFrame()
for df in retLst:
#     print 'type(df): %s' % (str(type(df)))
#     print 'type(drvNewCorDf): %s' % (str(type(drvNewCorDf)))    
    smpPrlNewCorDf = smpPrlNewCorDf.append(df)
    
print 'smp getSbtNewObsCorrelation parallel duration: %0.2f seconds' % \
    ((datetime.now() - thsBgnTm).seconds)     
print smpPrlNewCorDf    
        
# Parallel and sequential results must agree exactly
assert smpSqnNewCorDf.equals(smpPrlNewCorDf), \
    'smpSqnNewCorDf != smpPrlNewCorDf'
In [ ]:
thsBgnTm = datetime.now()    
retLst = Parallel(n_jobs = nCores, verbose = 2)(delayed(
        getSbtNewObsCorrelation)(sbt, 
                                 glbObsTrnIdn[:5000], 
                                 glbObsTrnFtr[:5000]) 
                                                for sbt in drvTrnSbt[:3])

# print retLst    
smpPrlNewCorDf = pd.DataFrame()
for df in retLst:
#     print 'type(df): %s' % (str(type(df)))
#     print 'type(drvNewCorDf): %s' % (str(type(drvNewCorDf)))    
    smpPrlNewCorDf = smpPrlNewCorDf.append(df)
    
print 'getSbtNewObsCorrelation parallel duration: %0.2f seconds' % \
    ((datetime.now() - thsBgnTm).seconds)     
print smpPrlNewCorDf    
        
thsBgnTm = datetime.now()    
retLst = []
for sbt in drvTrnSbt[:3]:    
    retLst.append(getSbtNewObsCorrelation(sbt, 
                                glbObsTrnIdn[:5000], glbObsTrnFtr[:5000]))
    
# print retLst    
smpSqnNewCorDf = pd.DataFrame()
for df in retLst:
    smpSqnNewCorDf = smpSqnNewCorDf.append(df)
    
print 'getSbtNewObsCorrelation sequential duration: %0.2f seconds' % \
    ((datetime.now() - thsBgnTm).seconds)     
print smpPrlNewCorDf    

assert smpSqnNewCorDf.equals(smpPrlNewCorDf), \
    'smpSqnNewCorDf != smpPrlNewCorDf'
In [ ]:
thsBgnTm = datetime.now()    
retLst = Parallel(n_jobs = nCores, verbose = 2)(delayed(
        getSbtNewObsCorrelation)(sbt, 
                                 glbObsTrnIdn, 
                                 glbObsTrnFtr) 
                                                for sbt in drvTrnSbt)

drvNewCorDf = pd.DataFrame()
for df in retLst:
    drvNewCorDf = drvNewCorDf.append(df)
    
print 'getSbtNewObsCorrelation parallel duration: %0.2f seconds' % \
    ((datetime.now() - thsBgnTm).seconds)     
print drvNewCorDf    
In [14]:
thsBgnTm = datetime.now() 
retLst = []
for sbt in drvSbtDf.index:    
    retLst.append(getSbtNewObsCorrelation(sbt, 
                                glbObsTrnIdn, glbObsTrnFtr,
                                         verbose = True))
    
sbtNewCorDf = pd.DataFrame()
for df in retLst:
    sbtNewCorDf = sbtNewCorDf.append(df)
    
print 'sbt getSbtNewObsCorrelation sequential duration: %0.2f seconds' % \
    ((datetime.now() - thsBgnTm).seconds)     
print sbtNewCorDf.sort_values('mean', ascending = False)  
getSbtNewObsCorrelation: sbt: p072
  corDf duration: 36.00 seconds
getSbtNewObsCorrelation: sbt: p042
  corDf duration: 42.00 seconds
getSbtNewObsCorrelation: sbt: p041
  corDf duration: 43.00 seconds
getSbtNewObsCorrelation: sbt: p039
  corDf duration: 46.00 seconds
getSbtNewObsCorrelation: sbt: p045
  corDf duration: 52.00 seconds
getSbtNewObsCorrelation: sbt: p002
  corDf duration: 52.00 seconds
getSbtNewObsCorrelation: sbt: p052
  corDf duration: 53.00 seconds
getSbtNewObsCorrelation: sbt: p050
  corDf duration: 60.00 seconds
getSbtNewObsCorrelation: sbt: p056
  corDf duration: 60.00 seconds
getSbtNewObsCorrelation: sbt: p061
  corDf duration: 64.00 seconds
getSbtNewObsCorrelation: sbt: p075
  corDf duration: 60.00 seconds
getSbtNewObsCorrelation: sbt: p064
  corDf duration: 62.00 seconds
getSbtNewObsCorrelation: sbt: p012
  corDf duration: 62.00 seconds
getSbtNewObsCorrelation: sbt: p081
  corDf duration: 64.00 seconds
getSbtNewObsCorrelation: sbt: p047
  corDf duration: 65.00 seconds
getSbtNewObsCorrelation: sbt: p035
  corDf duration: 63.00 seconds
getSbtNewObsCorrelation: sbt: p015
  (at 61 secs) chunkSize: 1; rowIx:   800
  corDf duration: 68.00 seconds
getSbtNewObsCorrelation: sbt: p014
  (at 63 secs) chunkSize: 1; rowIx:   800
  corDf duration: 69.00 seconds
getSbtNewObsCorrelation: sbt: p051
  (at 65 secs) chunkSize: 1; rowIx:   800
  corDf duration: 74.00 seconds
getSbtNewObsCorrelation: sbt: p049
  corDf duration: 60.00 seconds
getSbtNewObsCorrelation: sbt: p066
  chunkSize:   500; dffYIx:   226
  chunkSize: 1; xRowsN - dffXIx:  1000; yRowsN - dffYIx: 79500
  corDf duration: 67.00 seconds
getSbtNewObsCorrelation: sbt: p016
  chunkSize:   500; dffYIx:   226
  chunkSize: 1; xRowsN - dffXIx:  1000; yRowsN - dffYIx: 79500
  corDf duration: 72.00 seconds
getSbtNewObsCorrelation: sbt: p026
  chunkSize:   500; dffYIx:   226
  chunkSize: 1; xRowsN - dffXIx:  1000; yRowsN - dffYIx: 79500
  corDf duration: 81.00 seconds
getSbtNewObsCorrelation: sbt: p024
  chunkSize:   500; dffYIx:   226
  chunkSize: 1; xRowsN - dffXIx:  1000; yRowsN - dffYIx: 79500
  corDf duration: 84.00 seconds
getSbtNewObsCorrelation: sbt: p022
  chunkSize:   500; dffYIx:   226
  chunkSize: 1; xRowsN - dffXIx:  1000; yRowsN - dffYIx: 79500
  corDf duration: 85.00 seconds
getSbtNewObsCorrelation: sbt: p021
  chunkSize:   500; dffYIx:   226
  chunkSize: 1; xRowsN - dffXIx:  1000; yRowsN - dffYIx: 79500
  corDf duration: 85.00 seconds
sbt getSbtNewObsCorrelation sequential duration: 1643.00 seconds
   chunkSize  duration       max      mean    median       min  xRowsN  \
0        500        84  0.936134  0.575738  0.578098 -0.032779    1226   
0        500        36  0.896713  0.573754  0.580933 -0.019778     346   
0        500        60  0.951831  0.571181  0.553923  0.038137     814   
0        500        51  0.911403  0.570485  0.574516  0.033160     724   
0        500        85  0.945552  0.567124  0.573081 -0.108175    1237   
0        500        62  0.893492  0.565394  0.556607  0.006713     820   
0        500        64  0.913507  0.561276  0.555566 -0.021651     809   
0        500        64  0.930973  0.558110  0.555655 -0.048058     823   
0        500        67  0.924473  0.556668  0.545573  0.002514    1034   
0        500        60  0.896334  0.556492  0.566102  0.008886     790   
0        500        72  0.887267  0.553708  0.556487 -0.067284    1078   
0        500        60  0.885953  0.542682  0.549579 -0.016990    1011   
0        500        53  0.897490  0.542406  0.552354 -0.056472     740   
0        500        81  0.925953  0.542128  0.541458 -0.148784    1196   
0        500        63  0.905781  0.538200  0.535851  0.012447     848   
0        500        84  0.926006  0.529561  0.529676 -0.040819    1233   
0        500        74  0.904321  0.527758  0.545097 -0.090799     920   
0        500        52  0.860108  0.521689  0.524201 -0.124219     725   
0        500        43  0.846570  0.519290  0.510401  0.023227     605   
0        500        46  0.833699  0.512349  0.520130 -0.069207     651   
0        500        60  0.894858  0.502463  0.528389 -0.122631     794   
0        500        42  0.866144  0.483161  0.503277 -0.111311     591   
0        500        62  0.889807  0.481822  0.479718 -0.193823     823   
0        500        68  0.843857  0.476219  0.476996 -0.205910     875   
0        500        69  0.891644  0.461138  0.455255 -0.209080     876   
0        500        65  0.882749  0.412253  0.418998 -0.133917     835   

   yRowsN subject  
0   79726    p024  
0   79726    p072  
0   79726    p075  
0   79726    p045  
0   79726    p021  
0   79726    p064  
0   79726    p061  
0   79726    p081  
0   79726    p066  
0   79726    p050  
0   79726    p016  
0   79726    p049  
0   79726    p052  
0   79726    p026  
0   79726    p035  
0   79726    p022  
0   79726    p051  
0   79726    p002  
0   79726    p041  
0   79726    p039  
0   79726    p056  
0   79726    p042  
0   79726    p012  
0   79726    p015  
0   79726    p014  
0   79726    p047  
In [15]:
# Checkpoint the training / new arrays plus the per-subject correlation
# summary. Fix: use a 'with' block so the file handle is closed even if
# pickle.dump raises -- the original opened the file outside any
# finally/with and leaked the handle on failure.
try:
  with open(glbPickleFile['data'], 'wb') as f:
    save = {
      'glbObsTrnIdn': glbObsTrnIdn,
      'glbObsTrnFtr': glbObsTrnFtr,
      'glbObsTrnRsp': glbObsTrnRsp,
  #     'glbObsVldFtr': glbObsVldFtr,
  #     'glbObsVldRsp': glbObsVldRsp,
      'glbObsNewIdn': glbObsNewIdn,
      'glbObsNewFtr': glbObsNewFtr,
      'glbObsNewRsp': glbObsNewRsp,

      'sbtNewCorDf' : sbtNewCorDf
      }
    pickle.dump(save, f, pickle.HIGHEST_PROTOCOL)
except Exception as e:
  print('Unable to save data to', glbPickleFile['data'], ':', e)
  raise
    
statinfo = os.stat(glbPickleFile['data'])
print('Compressed Data pickle size:', statinfo.st_size)    
('Compressed Data pickle size:', 1676071695)
In [17]:
import ggplot
In [27]:
# Scatter of per-subject mean vs. median correlation with the New set;
# point size encodes the subject's number of training images, and each
# point is labeled with the subject id.
gp = ggplot.ggplot(ggplot.components.aes(x = 'median', y = 'mean'), data = sbtNewCorDf) + \
        ggplot.geom_point(ggplot.components.aes(size = 'xRowsN'), color = 'blue') + \
        ggplot.geom_text(ggplot.components.aes(label = 'subject'))
print gp
<ggplot: (292594817)>
In [30]:
sbtNewCorDf = sbtNewCorDf.sort_values('mean', ascending = False)
sbtNewCorDf['xRowsN.cum'] = sbtNewCorDf['xRowsN'].cumsum()
sbtNewCorDf['xRowsN.cum.nTrn.Ratio'] = sbtNewCorDf['xRowsN.cum'] * 1.0 / \
                                        sbtNewCorDf['xRowsN'].sum()
print sbtNewCorDf
   chunkSize  duration       max      mean    median       min  xRowsN  \
0        500        84  0.936134  0.575738  0.578098 -0.032779    1226   
0        500        36  0.896713  0.573754  0.580933 -0.019778     346   
0        500        60  0.951831  0.571181  0.553923  0.038137     814   
0        500        51  0.911403  0.570485  0.574516  0.033160     724   
0        500        85  0.945552  0.567124  0.573081 -0.108175    1237   
0        500        62  0.893492  0.565394  0.556607  0.006713     820   
0        500        64  0.913507  0.561276  0.555566 -0.021651     809   
0        500        64  0.930973  0.558110  0.555655 -0.048058     823   
0        500        67  0.924473  0.556668  0.545573  0.002514    1034   
0        500        60  0.896334  0.556492  0.566102  0.008886     790   
0        500        72  0.887267  0.553708  0.556487 -0.067284    1078   
0        500        60  0.885953  0.542682  0.549579 -0.016990    1011   
0        500        53  0.897490  0.542406  0.552354 -0.056472     740   
0        500        81  0.925953  0.542128  0.541458 -0.148784    1196   
0        500        63  0.905781  0.538200  0.535851  0.012447     848   
0        500        84  0.926006  0.529561  0.529676 -0.040819    1233   
0        500        74  0.904321  0.527758  0.545097 -0.090799     920   
0        500        52  0.860108  0.521689  0.524201 -0.124219     725   
0        500        43  0.846570  0.519290  0.510401  0.023227     605   
0        500        46  0.833699  0.512349  0.520130 -0.069207     651   
0        500        60  0.894858  0.502463  0.528389 -0.122631     794   
0        500        42  0.866144  0.483161  0.503277 -0.111311     591   
0        500        62  0.889807  0.481822  0.479718 -0.193823     823   
0        500        68  0.843857  0.476219  0.476996 -0.205910     875   
0        500        69  0.891644  0.461138  0.455255 -0.209080     876   
0        500        65  0.882749  0.412253  0.418998 -0.133917     835   

   yRowsN subject  sbt  xRowsN.cum  xRowsN.cum.nTrn.Ratio  
0   79726    p024    0        1226               0.054674  
0   79726    p072    0        1572               0.070103  
0   79726    p075    0        2386               0.106404  
0   79726    p045    0        3110               0.138691  
0   79726    p021    0        4347               0.193855  
0   79726    p064    0        5167               0.230423  
0   79726    p061    0        5976               0.266500  
0   79726    p081    0        6799               0.303202  
0   79726    p066    0        7833               0.349313  
0   79726    p050    0        8623               0.384543  
0   79726    p016    0        9701               0.432617  
0   79726    p049    0       10712               0.477702  
0   79726    p052    0       11452               0.510703  
0   79726    p026    0       12648               0.564039  
0   79726    p035    0       13496               0.601855  
0   79726    p022    0       14729               0.656841  
0   79726    p051    0       15649               0.697868  
0   79726    p002    0       16374               0.730200  
0   79726    p041    0       16979               0.757180  
0   79726    p039    0       17630               0.786211  
0   79726    p056    0       18424               0.821620  
0   79726    p042    0       19015               0.847975  
0   79726    p012    0       19838               0.884677  
0   79726    p015    0       20713               0.923698  
0   79726    p014    0       21589               0.962763  
0   79726    p047    0       22424               1.000000  
In [41]:
drvVldSbt = sbtNewCorDf[sbtNewCorDf['xRowsN.cum.nTrn.Ratio'] < 0.2]
print '\ndrvVldSbt:'; print drvVldSbt
# print ~sbtNewCorDf.subject.isin(drvVldSbt['subject'])
drvFitSbt = sbtNewCorDf[~sbtNewCorDf.subject.isin(drvVldSbt['subject'])]
print '\ndrvFitSbt:'; print drvFitSbt
assert len(set(drvVldSbt['subject']).intersection(set(drvFitSbt['subject']))) == 0, \
    'drvVldSbt has elements in drvFitSbt'
drvVldSbt:
   chunkSize  duration       max      mean    median       min  xRowsN  \
0        500        84  0.936134  0.575738  0.578098 -0.032779    1226   
0        500        36  0.896713  0.573754  0.580933 -0.019778     346   
0        500        60  0.951831  0.571181  0.553923  0.038137     814   
0        500        51  0.911403  0.570485  0.574516  0.033160     724   
0        500        85  0.945552  0.567124  0.573081 -0.108175    1237   

   yRowsN subject  sbt  xRowsN.cum  xRowsN.cum.nTrn.Ratio  
0   79726    p024    0        1226               0.054674  
0   79726    p072    0        1572               0.070103  
0   79726    p075    0        2386               0.106404  
0   79726    p045    0        3110               0.138691  
0   79726    p021    0        4347               0.193855  

drvFitSbt:
   chunkSize  duration       max      mean    median       min  xRowsN  \
0        500        62  0.893492  0.565394  0.556607  0.006713     820   
0        500        64  0.913507  0.561276  0.555566 -0.021651     809   
0        500        64  0.930973  0.558110  0.555655 -0.048058     823   
0        500        67  0.924473  0.556668  0.545573  0.002514    1034   
0        500        60  0.896334  0.556492  0.566102  0.008886     790   
0        500        72  0.887267  0.553708  0.556487 -0.067284    1078   
0        500        60  0.885953  0.542682  0.549579 -0.016990    1011   
0        500        53  0.897490  0.542406  0.552354 -0.056472     740   
0        500        81  0.925953  0.542128  0.541458 -0.148784    1196   
0        500        63  0.905781  0.538200  0.535851  0.012447     848   
0        500        84  0.926006  0.529561  0.529676 -0.040819    1233   
0        500        74  0.904321  0.527758  0.545097 -0.090799     920   
0        500        52  0.860108  0.521689  0.524201 -0.124219     725   
0        500        43  0.846570  0.519290  0.510401  0.023227     605   
0        500        46  0.833699  0.512349  0.520130 -0.069207     651   
0        500        60  0.894858  0.502463  0.528389 -0.122631     794   
0        500        42  0.866144  0.483161  0.503277 -0.111311     591   
0        500        62  0.889807  0.481822  0.479718 -0.193823     823   
0        500        68  0.843857  0.476219  0.476996 -0.205910     875   
0        500        69  0.891644  0.461138  0.455255 -0.209080     876   
0        500        65  0.882749  0.412253  0.418998 -0.133917     835   

   yRowsN subject  sbt  xRowsN.cum  xRowsN.cum.nTrn.Ratio  
0   79726    p064    0        5167               0.230423  
0   79726    p061    0        5976               0.266500  
0   79726    p081    0        6799               0.303202  
0   79726    p066    0        7833               0.349313  
0   79726    p050    0        8623               0.384543  
0   79726    p016    0        9701               0.432617  
0   79726    p049    0       10712               0.477702  
0   79726    p052    0       11452               0.510703  
0   79726    p026    0       12648               0.564039  
0   79726    p035    0       13496               0.601855  
0   79726    p022    0       14729               0.656841  
0   79726    p051    0       15649               0.697868  
0   79726    p002    0       16374               0.730200  
0   79726    p041    0       16979               0.757180  
0   79726    p039    0       17630               0.786211  
0   79726    p056    0       18424               0.821620  
0   79726    p042    0       19015               0.847975  
0   79726    p012    0       19838               0.884677  
0   79726    p015    0       20713               0.923698  
0   79726    p014    0       21589               0.962763  
0   79726    p047    0       22424               1.000000  
In [ ]:
from sklearn.cross_validation import train_test_split
In [99]:
drvTrnSbt = driverDf.subject.unique()
drvTrnN = drvTrnSbt.shape[0]
drvVldN = int(drvTrnN * 0.2)

drvFitSbt, drvVldSbt = train_test_split(drvTrnSbt, 
                                        test_size = drvVldN, 
                                        random_state = glbObsShuffleSeed)
drvFitSbt.sort()
drvVldSbt.sort()
print 'Vld subjects obtained: %5d; vs. desired: %5d' % \
    (drvVldSbt.shape[0], drvVldN)
print type(drvVldSbt)
print drvVldSbt    
print 'Fit subjects obtained: %5d' % \
    (drvFitSbt.shape[0])
Vld subjects obtained:     5; vs. desired:     5
<type 'numpy.ndarray'>
['p012' 'p022' 'p047' 'p049' 'p056']
Fit subjects obtained:    21
In [92]:
# Spot-check that the validation-driver membership test and the image
# lookup chain behave as expected on a few rows.
print driverDf.subject[1000:1005]
# print (driverDf.subject.isin(drvVldSbt)).shape
print driverDf.subject.isin(drvVldSbt)[1000:1005]
# print driverDf[driverDf.subject.isin(drvVldSbt)]['img'][0:5]
print driverDf[driverDf.subject.isin(drvVldSbt)][1000:1005]
print driverDf[driverDf.subject.isin(drvVldSbt)]['img'][1000:1005]
# First few training identifiers that belong to validation drivers
print [glbObsTrnIdn[ix] for ix in xrange(len(glbObsTrnIdn))
       if glbObsTrnIdn[ix] in set(driverDf[driverDf.subject.isin(drvVldSbt)]['img'])
      ][0:5]
1000    p012
1001    p012
1002    p012
1003    p012
1004    p012
Name: subject, dtype: object
1000    True
1001    True
1002    True
1003    True
1004    True
Name: subject, dtype: bool
     subject classname            img
5791    p022        c1  img_74261.jpg
5792    p022        c1  img_96164.jpg
5793    p022        c1  img_71235.jpg
5794    p022        c1  img_89982.jpg
5795    p022        c1  img_16706.jpg
5791    img_74261.jpg
5792    img_96164.jpg
5793    img_71235.jpg
5794    img_89982.jpg
5795    img_16706.jpg
Name: img, dtype: object
['img_12780.jpg', 'img_89899.jpg', 'img_101194.jpg', 'img_84614.jpg', 'img_13214.jpg']
In [44]:
# obsTrnN = glbObsTrnFtr.shape[0] # or fixed number e.g. 20000
# obsVldN = int(obsTrnN * 0.2)
# print 'obsTrnN: %d; obsVldN: %d' % (obsTrnN, obsVldN)

tmpVldSbtImgSet = set(driverDf[driverDf.subject.isin(drvVldSbt.subject)]['img'])
# print tmpVldSbtImgSet
tmpObsVldIdx = [ix for ix in xrange(len(glbObsTrnIdn)) 
                if glbObsTrnIdn[ix] in tmpVldSbtImgSet]
glbObsVldIdn = [glbObsTrnIdn[ix] for ix in tmpObsVldIdx]
glbObsVldFtr = glbObsTrnFtr[tmpObsVldIdx,:,:]
glbObsVldRsp = glbObsTrnRsp[tmpObsVldIdx]

# glbObsFitIdn = glbObsTrnIdn[obsVldN:obsVldN+obsTrnN]
# glbObsFitFtr = glbObsTrnFtr[obsVldN:obsVldN+obsTrnN,:,:]
# glbObsFitRsp = glbObsTrnRsp[obsVldN:obsVldN+obsTrnN]

# print('   Fitting:', len(glbObsFitIdn), glbObsFitFtr.shape, glbObsFitRsp.shape)
print('Validation:', len(glbObsVldIdn), glbObsVldFtr.shape, glbObsVldRsp.shape)
print 'Validation / Trn Obs: %.4f' % (len(glbObsVldIdn) * 1.0 / len(glbObsTrnIdn))
('Validation:', 4347, (4347, 64, 64), (4347,))
Validation / Trn Obs: 0.1939
In [45]:
tmpFitSbtImgSet = set(driverDf[driverDf.subject.isin(drvFitSbt.subject)]['img'])
# print tmpVldSbtImgSet
tmpObsFitIdx = [ix for ix in xrange(len(glbObsTrnIdn)) 
                if glbObsTrnIdn[ix] in tmpFitSbtImgSet]
glbObsFitIdn = [glbObsTrnIdn[ix] for ix in tmpObsFitIdx]
glbObsFitFtr = glbObsTrnFtr[tmpObsFitIdx,:,:]
glbObsFitRsp = glbObsTrnRsp[tmpObsFitIdx]

# glbObsFitIdn = glbObsTrnIdn[obsVldN:obsVldN+obsTrnN]
# glbObsFitFtr = glbObsTrnFtr[obsVldN:obsVldN+obsTrnN,:,:]
# glbObsFitRsp = glbObsTrnRsp[obsVldN:obsVldN+obsTrnN]

print('   Fitting:', len(glbObsFitIdn), glbObsFitFtr.shape, glbObsFitRsp.shape)
print 'Fit / Trn Obs: %.4f' % (len(glbObsFitIdn) * 1.0 / len(glbObsTrnIdn))
('   Fitting:', 18077, (18077, 64, 64), (18077,))
Fit / Trn Obs: 0.8061
In [55]:
# obsTrnN = glbObsTrnFtr.shape[0]
# #obsTrnN = 200000
# obsVldN = 10000

# glbObsVldFtr = glbObsTrnFtr[:obsVldN,:,:]
# glbObsVldRsp = glbObsTrnRsp[:obsVldN]
# glbObsTrnFtr = glbObsTrnFtr[obsVldN:obsVldN+obsTrnN,:,:]
# glbObsTrnRsp = glbObsTrnRsp[obsVldN:obsVldN+obsTrnN]
# print('Training', glbObsTrnFtr.shape, glbObsTrnRsp.shape)
# print('Validation', glbObsVldFtr.shape, glbObsVldRsp.shape)
In [46]:
# Class counts in the validation split, and each class's share of the
# full training counts -- all classes should contribute a similar ratio.
print 'glbObsVldRsp class knts & Trn ratios: '
print (np.unique(glbObsVldRsp, return_counts = True))
print (np.unique(glbObsVldRsp, return_counts = True)[1] * 1.0 / 
       np.unique(glbObsTrnRsp, return_counts = True)[1])
glbObsVldRsp class knts & Trn ratios: 
(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=int32), array([484, 478, 452, 443, 459, 422, 443, 350, 364, 452]))
[ 0.1944556   0.21085135  0.19507984  0.18883205  0.19733448  0.18252595
  0.19053763  0.17482517  0.19047619  0.21230625]
In [54]:
# Checkpoint the fit / validation / new splits plus the correlation
# summary. Fix: use a 'with' block so the file handle is closed even if
# pickle.dump raises -- the original opened the file outside any
# finally/with and leaked the handle on failure.
try:
  with open(glbPickleFile['data'], 'wb') as f:
    save = {
  #     'glbObsTrnIdn': glbObsTrnIdn,
  #     'glbObsTrnFtr': glbObsTrnFtr,
  #     'glbObsTrnRsp': glbObsTrnRsp,

      'glbObsFitIdn': glbObsFitIdn,
      'glbObsFitFtr': glbObsFitFtr,
      'glbObsFitRsp': glbObsFitRsp,

      'glbObsVldIdn': glbObsVldIdn,
      'glbObsVldFtr': glbObsVldFtr,
      'glbObsVldRsp': glbObsVldRsp,

      'glbObsNewIdn': glbObsNewIdn,
      'glbObsNewFtr': glbObsNewFtr,
      'glbObsNewRsp': glbObsNewRsp,

      'sbtNewCorDf' : sbtNewCorDf
      }
    pickle.dump(save, f, pickle.HIGHEST_PROTOCOL)
except Exception as e:
  print('Unable to save data to', glbPickleFile['data'], ':', e)
  raise
    
statinfo = os.stat(glbPickleFile['data'])
print('Compressed Data pickle size:', statinfo.st_size)    
('Compressed Data pickle size:', 1676072621)

Check images in validation set to ensure that they are from the appropriate subjects

In [90]:
def lclDisplaySubjectSampleImages(smpSubjects, lclObsIdn, lclObsFtr, lclObsRsp):
    """Show a grid of sample images: one row per subject, 3 random classes.

    smpSubjects -- iterable of subject (driver) ids to display
    lclObsIdn   -- list of image filenames aligned with lclObsFtr / lclObsRsp
    lclObsFtr   -- image array, rows aligned with lclObsIdn
    lclObsRsp   -- integer class labels (indexes into glbRspClass)

    Each title shows subject:stored-class-label:filename so the reader can
    spot mislabeled images by eye.
    """
#     print smpSubjects
    
    smpNImg = 3
    nRow = len(smpSubjects); nCol = smpNImg    
    figs, axes = plt.subplots(nRow, nCol, 
                              figsize=(6 * nCol, 6 * nRow))
    # Strip ticks/axes from every panel up front
    [(ax.set_xticks([]), ax.set_yticks([]), ax.axis('off')) 
     for ax in axes.flatten()]
    for i, sbt in enumerate(smpSubjects):
        smpSbtDbImg = driverDf[driverDf.subject == sbt]
        # Select samples from different classes for each subject
        smpCls = [glbRspClass[clsIx] 
                  for clsIx in np.random.randint(0, glbRspClassN, smpNImg)]
#         print 'smpCls: '; print smpCls
        for j, cls in enumerate(smpCls):
            smpSbtClsDbImg = smpSbtDbImg[smpSbtDbImg.classname == cls]
#             print 'sbt: %s; cls: %s; smpSbtClsDbImg.shape:' % (sbt, cls)
#             print smpSbtClsDbImg.shape
#             print smpSbtClsDbImg.columns
#             print np.random.randint(0, smpSbtClsDbImg.shape[0], 1)[0]
#             print smpSbtClsDbImg.iloc[0]['img']
            
            # Pick one random image of this subject+class, then locate its
            # row position in the observation identifier list
            ixIdn = lclObsIdn.index(smpSbtClsDbImg.iloc[
                    np.random.randint(0, smpSbtClsDbImg.shape[0], 1)[0]]['img'])
            if glbImg['color']:
                axes[i, j].imshow(lclObsFtr[ixIdn, :, :])
            else:    
                axes[i, j].imshow(lclObsFtr[ixIdn, :, :], cmap = 'gray')
#             axes[i, j].set_title(sbt + ':' + cls + 
#                                  ':' + smpSbtClsDbImg.iloc[0]['img'])
            axes[i, j].set_title(sbt + ':' + 
                                 glbRspClass[lclObsRsp[ixIdn]] + ':' + 
                                 lclObsIdn[ixIdn])
    plt.show()
    
# Visually verify the validation split only contains validation drivers
lclDisplaySubjectSampleImages(drvVldSbt['subject'].unique(),
                              glbObsVldIdn, glbObsVldFtr, glbObsVldRsp)  

Corrections found here

In [91]:
plt.imshow(myreadImage(trnFoldersPth + '/c5/img_78504.jpg'))
Out[91]:
<matplotlib.image.AxesImage at 0x120894b90>
In [92]:
lclDisplaySubjectSampleImages(drvFitSbt['subject'].unique(),
                              glbObsFitIdn, glbObsFitFtr, glbObsFitRsp)  

Corrections found here

In [96]:
plt.imshow(myreadImage(trnFoldersPth + '/c9/img_16428.jpg'))
# plt.imshow(myreadImage(trnFoldersPth + '/c9/img_71047.jpg')) debatable
Out[96]:
<matplotlib.image.AxesImage at 0x117715890>

Inspect overlap

By construction, this dataset might contain a lot of overlapping samples, including training data that's also contained in the validation and test set! Overlap between training and test can skew the results if you expect to use your model in an environment where there is never an overlap, but are actually ok if you expect to see training samples recur when you use it. Measure how much overlap there is between training, validation and test samples.

Optional questions:

  • What about near duplicates between datasets? (images that are almost identical)
  • Create a sanitized validation and test set, and compare your accuracy on those in subsequent assignments.

In [99]:
# obsFitSet = set(img.tostring() for img in glbObsFitFtr)
# print 'Fit: shape: %s vs. len(set): %d pctDups: %0.4f' % \
#     (glbObsFitFtr.shape, len(obsFitSet), \
#      (glbObsFitFtr.shape[0] * 1.0 / len(obsFitSet) - 1) * 100)

obsTrnSet = set(img.tostring() for img in glbObsTrnFtr)
print 'Trn: shape: %s vs. len(set): %5d pctDups: %0.4f' % \
    (glbObsTrnFtr.shape, len(obsTrnSet), \
     (glbObsTrnFtr.shape[0] * 1.0 / len(obsTrnSet) - 1) * 100)

obsFitSet = set(img.tostring() for img in glbObsFitFtr)
print 'Fit: shape: %s vs. len(set): %5d pctDups: %0.4f' % \
    (glbObsFitFtr.shape, len(obsFitSet), \
     (glbObsFitFtr.shape[0] * 1.0 / len(obsFitSet) - 1) * 100)

obsVldSet = set(img.tostring() for img in glbObsVldFtr)
print 'Vld: shape: %s vs. len(set): %5d pctDups: %0.4f' % \
    (glbObsVldFtr.shape, len(obsVldSet), \
     (glbObsVldFtr.shape[0] * 1.0 / len(obsVldSet) - 1) * 100)

obsNewSet = set(img.tostring() for img in glbObsNewFtr)
print 'New: shape: %s vs. len(set): %5d pctDups: %0.4f' % \
    (glbObsNewFtr.shape, len(obsNewSet), \
     (glbObsNewFtr.shape[0] * 1.0 / len(obsNewSet) - 1) * 100) 
Trn: shape: (22424, 64, 64) vs. len(set): 22424 pctDups: 0.0000
Fit: shape: (18077, 64, 64) vs. len(set): 18077 pctDups: 0.0000
Vld: shape: (4347, 64, 64) vs. len(set):  4347 pctDups: 0.0000
New: shape: (79726, 64, 64) vs. len(set): 79724 pctDups: 0.0025
In [139]:
# Find WHICH New-set images are byte-identical duplicates (the set-based
# check above only counted them).
# print np.unique(glbObsNewFtr[:5], return_inverse = True)[1].shape
# print np.vstack({tuple(row) for row in glbObsNewFtr[:5]})
tmpObsNewFtr = glbObsNewFtr
# print 'tmpObsNewFtr.shape: %s' % (str(tmpObsNewFtr.shape))
# Flatten each image to one row so a whole image becomes one comparable unit
rshObsNewFtr = np.reshape(tmpObsNewFtr, 
                          (tmpObsNewFtr.shape[0], 
                           tmpObsNewFtr.shape[1] * tmpObsNewFtr.shape[2]))
# print 'rshObsNewFtr.shape: %s' % (str(rshObsNewFtr.shape))                          
# print np.ascontiguousarray(tmpObsNewFtr).shape
# View each row as a single opaque np.void scalar (raw bytes of the whole
# row), so np.unique can dedupe entire images in one call
conObsNewFtr = np.ascontiguousarray(rshObsNewFtr).view(np.dtype((np.void, 
                    rshObsNewFtr.dtype.itemsize * rshObsNewFtr.shape[1])))
# print conObsNewFtr.shape
# # print conObsNewFtr # This prints gibberish
# idx maps every row to the index of its first identical occurrence
_, idx = np.unique(conObsNewFtr, return_inverse=True)
# print idx.shape
# print idx
# Rows sharing an idx value > 1 time are exact duplicates of each other
frqObsNewFtr = pd.Series(idx).value_counts()
# print frqObsNewFtr
# print type(frqObsNewFtr[frqObsNewFtr > 1])
print frqObsNewFtr[frqObsNewFtr > 1]
4        3
2047     1
17053    1
6806     1
4759     1
27288    1
25241    1
31386    1
29339    1
19100    1
23198    1
2708     1
21151    1
76464    1
74417    1
78515    1
68276    1
66229    1
72374    1
661      1
12947    1
43712    1
14994    1
45699    1
35460    1
33413    1
39558    1
37511    1
60040    1
57993    1
        ..
40313    1
34170    1
36219    1
46460    1
48509    1
42366    1
44415    1
71048    1
73097    1
66954    1
52595    1
56689    1
21856    1
54640    1
23905    1
17762    1
19811    1
30052    1
32101    1
25958    1
28007    1
5480     1
7529     1
1386     1
3435     1
13676    1
15725    1
9582     1
11631    1
0        1
dtype: int64
4    3
dtype: int64
In [142]:
dupsIx = np.where(idx == 4)
print dupsIx
(array([21164, 60293, 61990]),)
In [148]:
# Display the New-set images that were flagged as byte-identical
# duplicates (row positions in dupsIx), for visual confirmation
for obsIx in dupsIx[0]:
    print obsIx
    plt.imshow(myreadImage(newFoldersPth + '/' + glbObsNewIdn[obsIx]))
    plt.title('new:' + glbObsNewIdn[obsIx])
    plt.show()
21164
60293
61990
In [100]:
print 'Vld set overlap with Fit set: %0.4f' % \
    (len(obsVldSet.intersection(obsFitSet)) * 1.0 / len(obsVldSet))
print 'Vld set overlap with New set: %0.4f' % \
    (len(obsVldSet.intersection(obsNewSet)) * 1.0 / len(obsNewSet))
print 'Fit set overlap with New set: %0.4f' % \
    (len(obsFitSet.intersection(obsNewSet)) * 1.0 / len(obsFitSet))
Vld set overlap with Fit set: 0.0000
Vld set overlap with New set: 0.0000
Fit set overlap with New set: 0.0000
In [101]:
print mygetCorrObs(glbObsVldFtr, glbObsFitFtr, chunkSize = 1000)
   chunkSize  duration       max      mean    median      min  xRowsN  yRowsN
0       1000        52  0.931716  0.574565  0.571536 -0.00211    4347   18077
In [102]:
print mygetCorrObs(glbObsVldFtr, glbObsNewFtr, chunkSize = 1000)
  chunkSize:  1000; rowIx:  2000
  chunkSize:  1000; dffXIx:   347
  chunkSize:  1000; dffYIx:   726
  chunkSize: 1; xRowsN - dffXIx:  4000; yRowsN - dffYIx: 79000
  (at 188 secs) chunkSize: 1; rowIx:  4000
   chunkSize  duration       max      mean    median       min  xRowsN  yRowsN
0       1000       262  0.951831  0.571401  0.573735 -0.108175    4347   79726
In [12]:
# Sanity check: the xRowsN-weighted average of the per-subject mean
# correlations should reproduce the overall Vld-vs-New mean (~0.5714)
# computed above.
print sbtNewCorDf[:5]['mean']
print sbtNewCorDf[:5]['xRowsN']
print np.average(sbtNewCorDf[:5]['mean'], 
                 weights = sbtNewCorDf[:5]['xRowsN'])
0    0.575738
0    0.573754
0    0.571181
0    0.570485
0    0.567124
Name: mean, dtype: float64
0    1226
0     346
0     814
0     724
0    1237
Name: xRowsN, dtype: int64
0.571400557397
In [13]:
# chunkSize tuning: Fit[:1000] vs New at chunkSize=500.
print mygetCorrObs(glbObsFitFtr[:1000], glbObsNewFtr, chunkSize = 500)
   chunkSize  duration       max      mean    median      min  xRowsN  yRowsN
0        500        50  0.922835  0.522256  0.524711 -0.18704    1000   79726
In [14]:
# chunkSize tuning: Fit[:1000] at chunkSize=5000 -- much slower here
# (falls back to per-row processing; see the log below).
print mygetCorrObs(glbObsFitFtr[:1000], glbObsNewFtr, chunkSize = 5000)
  (at    63 secs) chunkSize:  5000; dffYIx:  4726
  (at    63 secs) chunkSize: 1; xRowsN - dffXIx:     0; yRowsN - dffYIx: 75000
  (at 63 secs) chunkSize: 1; rowIx:     0
  (at 65 secs) chunkSize: 1; rowIx:     2
  (at 67 secs) chunkSize: 1; rowIx:     4
  (at 69 secs) chunkSize: 1; rowIx:     6
  (at 71 secs) chunkSize: 1; rowIx:     8
  (at 83 secs) chunkSize: 1; rowIx:    20
  (at 102 secs) chunkSize: 1; rowIx:    40
  (at 121 secs) chunkSize: 1; rowIx:    60
  (at 140 secs) chunkSize: 1; rowIx:    80
  (at 256 secs) chunkSize: 1; rowIx:   200
  (at 449 secs) chunkSize: 1; rowIx:   400
  (at 643 secs) chunkSize: 1; rowIx:   600
  (at 837 secs) chunkSize: 1; rowIx:   800
   chunkSize  duration       max      mean    median      min  xRowsN  yRowsN
0       5000      1035  0.922835  0.522256  0.524711 -0.18704    1000   79726
In [12]:
# chunkSize tuning: Fit[:1000] at chunkSize=1000 -- fastest of the
# three settings tried (42 secs).
print mygetCorrObs(glbObsFitFtr[:1000], glbObsNewFtr, chunkSize = 1000)
   chunkSize  duration       max      mean    median      min  xRowsN  yRowsN
0       1000        42  0.922835  0.522256  0.524711 -0.18704    1000   79726
In [15]:
# chunkSize tuning: Fit[:2000] at chunkSize=1000.
print mygetCorrObs(glbObsFitFtr[:2000], glbObsNewFtr, chunkSize = 1000)
  (at    77 secs) chunkSize:  1000; dffYIx:   726
  (at    78 secs) chunkSize: 1; xRowsN - dffXIx:  2000; yRowsN - dffYIx: 79000
   chunkSize  duration       max     mean    median      min  xRowsN  yRowsN
0       1000        86  0.922835  0.52249  0.525066 -0.18704    2000   79726
In [18]:
# chunkSize tuning: Fit[:2000] at chunkSize=3000 -- again degrades to
# per-row processing (see log).
print mygetCorrObs(glbObsFitFtr[:2000], glbObsNewFtr, chunkSize = 3000)
  (at    76 secs) chunkSize:  3000; dffYIx:  1726
  (at    76 secs) chunkSize: 1; xRowsN - dffXIx:     0; yRowsN - dffYIx: 78000
  (at 76 secs) chunkSize: 1; rowIx:     0
  (at 76 secs) chunkSize: 1; rowIx:     2
  (at 77 secs) chunkSize: 1; rowIx:     4
  (at 78 secs) chunkSize: 1; rowIx:     6
  (at 78 secs) chunkSize: 1; rowIx:     8
  (at 83 secs) chunkSize: 1; rowIx:    20
  (at 90 secs) chunkSize: 1; rowIx:    40
  (at 97 secs) chunkSize: 1; rowIx:    60
  (at 104 secs) chunkSize: 1; rowIx:    80
  (at 146 secs) chunkSize: 1; rowIx:   200
  (at 217 secs) chunkSize: 1; rowIx:   400
  (at 288 secs) chunkSize: 1; rowIx:   600
  (at 358 secs) chunkSize: 1; rowIx:   800
   chunkSize  duration       max     mean    median      min  xRowsN  yRowsN
0       3000       789  0.922835  0.52249  0.525066 -0.18704    2000   79726
In [16]:
# chunkSize tuning: Fit[:2000] at chunkSize=2000.
print mygetCorrObs(glbObsFitFtr[:2000], glbObsNewFtr, chunkSize = 2000)
  (at    70 secs) chunkSize:  2000; dffYIx:  1726
  (at    72 secs) chunkSize: 1; xRowsN - dffXIx:  2000; yRowsN - dffYIx: 78000
   chunkSize  duration       max     mean    median      min  xRowsN  yRowsN
0       2000        80  0.922835  0.52249  0.525066 -0.18704    2000   79726
In [19]:
# chunkSize tuning: Fit[:5000] at chunkSize=2000.
print mygetCorrObs(glbObsFitFtr[:5000], glbObsNewFtr, chunkSize = 2000)
  (at    71 secs) chunkSize:  2000; rowIx:  2000
  (at   138 secs) chunkSize:  2000; dffXIx:  1000
  (at   178 secs) chunkSize:  2000; dffYIx:  1726
  (at   181 secs) chunkSize: 1; xRowsN - dffXIx:  4000; yRowsN - dffYIx: 78000
  (at 181 secs) chunkSize: 1; rowIx:  4000
   chunkSize  duration       max      mean    median       min  xRowsN  yRowsN
0       2000       552  0.922835  0.522854  0.525949 -0.208243    5000   79726
In [22]:
# chunkSize tuning: Fit[:5000] at chunkSize=500.
print mygetCorrObs(glbObsFitFtr[:5000], glbObsNewFtr, chunkSize = 500)
  (at   101 secs) chunkSize:   500; rowIx:  2000
  (at   194 secs) chunkSize:   500; rowIx:  4000
  (at   241 secs) chunkSize:   500; dffYIx:   226
  (at   242 secs) chunkSize: 1; xRowsN - dffXIx:  5000; yRowsN - dffYIx: 79500
   chunkSize  duration       max      mean    median       min  xRowsN  yRowsN
0        500       260  0.922835  0.522854  0.525949 -0.208243    5000   79726
In [21]:
# chunkSize tuning: Fit[:5000] at chunkSize=1000 -- best of the three
# settings tried for this sample size (220 secs).
print mygetCorrObs(glbObsFitFtr[:5000], glbObsNewFtr, chunkSize = 1000)
  (at    81 secs) chunkSize:  1000; rowIx:  2000
  (at   159 secs) chunkSize:  1000; rowIx:  4000
  (at   196 secs) chunkSize:  1000; dffYIx:   726
  (at   198 secs) chunkSize: 1; xRowsN - dffXIx:  5000; yRowsN - dffYIx: 79000
   chunkSize  duration       max      mean    median       min  xRowsN  yRowsN
0       1000       220  0.922835  0.522854  0.525949 -0.208243    5000   79726
In [23]:
# Larger run: Fit[:10000] vs the full New set at chunkSize=1000.
print mygetCorrObs(glbObsFitFtr[:10000], glbObsNewFtr, chunkSize = 1000)
  (at    80 secs) chunkSize:  1000; rowIx:  2000
  (at   158 secs) chunkSize:  1000; rowIx:  4000
  (at   237 secs) chunkSize:  1000; rowIx:  6000
  (at   313 secs) chunkSize:  1000; rowIx:  8000
  (at   389 secs) chunkSize:  1000; dffYIx:   726
  (at   392 secs) chunkSize: 1; xRowsN - dffXIx: 10000; yRowsN - dffYIx: 79000
   chunkSize  duration       max      mean   median       min  xRowsN  yRowsN
0       1000      2720  0.926006  0.522942  0.52616 -0.208243   10000   79726
In [ ]:
# print mygetCorrObs(glbObsFitFtr, glbObsNewFtr, chunkSize = 1000)
In [6]:
with open(glbPickleFile['data'], 'rb') as f:
  save = pickle.load(f)

#   glbObsTrnIdn = save['glbObsTrnIdn']
#   glbObsTrnFtr = save['glbObsTrnFtr']
#   glbObsTrnRsp = save['glbObsTrnRsp']
    
  glbObsFitIdn = save['glbObsFitIdn']
  glbObsFitFtr = save['glbObsFitFtr']
  glbObsFitRsp = save['glbObsFitRsp']

  glbObsVldIdn = save['glbObsVldIdn']
  glbObsVldFtr = save['glbObsVldFtr']
  glbObsVldRsp = save['glbObsVldRsp']

  glbObsNewIdn = save['glbObsNewIdn']
  glbObsNewFtr = save['glbObsNewFtr']
  glbObsNewRsp = save['glbObsNewRsp']

  sbtNewCorDf = save['sbtNewCorDf']

  del save  # hint to help gc free up memory

  print('Fit set:', len(glbObsFitIdn), glbObsFitFtr.shape, 
                    glbObsFitRsp.shape)
  print('Vld set:', len(glbObsVldIdn), glbObsVldFtr.shape, 
                    glbObsVldRsp.shape)
  print('New set:', len(glbObsNewIdn), glbObsNewFtr.shape, 
                    glbObsNewRsp.shape)
  print '\nsbtNewCorDf:'; print (sbtNewCorDf.head())  
('Fit set:', 18077, (18077, 64, 64), (18077,))
('Vld set:', 4347, (4347, 64, 64), (4347,))
('New set:', 79726, (79726, 64, 64), (79726,))

sbtNewCorDf:
   chunkSize  duration       max      mean    median       min  xRowsN  \
0        500        84  0.936134  0.575738  0.578098 -0.032779    1226   
0        500        36  0.896713  0.573754  0.580933 -0.019778     346   
0        500        60  0.951831  0.571181  0.553923  0.038137     814   
0        500        51  0.911403  0.570485  0.574516  0.033160     724   
0        500        85  0.945552  0.567124  0.573081 -0.108175    1237   

   yRowsN subject  sbt  xRowsN.cum  xRowsN.cum.nTrn.Ratio  
0   79726    p024    0        1226               0.054674  
0   79726    p072    0        1572               0.070103  
0   79726    p075    0        2386               0.106404  
0   79726    p045    0        3110               0.138691  
0   79726    p021    0        4347               0.193855  
In [9]:
print mygetCorrObs(glbObsFitFtr[
        np.random.permutation(glbObsFitFtr.shape[0])[
            np.random.randint(0, glbObsFitFtr.shape[0], 100)
        ]], 
                   glbObsNewFtr, chunkSize = 1000)
   chunkSize  duration       max      mean    median       min  xRowsN  yRowsN
0       1000        29  0.917209  0.528387  0.531076 -0.101828     100   79726
In [10]:
print mygetCorrObs(glbObsFitFtr[
        np.random.permutation(glbObsFitFtr.shape[0])[
            np.random.randint(0, glbObsFitFtr.shape[0], 1000)
        ]], 
                   glbObsNewFtr, chunkSize = 1000)
   chunkSize  duration       max      mean    median       min  xRowsN  yRowsN
0       1000        42  0.915992  0.520396  0.523352 -0.193823    1000   79726
In [11]:
# Correlate a 10000-row random sample of Fit (without replacement,
# via permutation slicing) vs the full New set.
print mygetCorrObs(glbObsFitFtr[
        np.random.permutation(glbObsFitFtr.shape[0])[:10000]], 
                   glbObsNewFtr, chunkSize = 1000)
  (at    80 secs) chunkSize:  1000; rowIx:  2000
  (at   155 secs) chunkSize:  1000; rowIx:  4000
  (at   230 secs) chunkSize:  1000; rowIx:  6000
  (at   303 secs) chunkSize:  1000; rowIx:  8000
  (at   378 secs) chunkSize:  1000; dffYIx:   726
  (at   381 secs) chunkSize: 1; xRowsN - dffXIx: 10000; yRowsN - dffYIx: 79000
   chunkSize  duration       max      mean    median       min  xRowsN  yRowsN
0       1000      1078  0.924473  0.523323  0.526498 -0.208243   10000   79726
In [ ]:
# Full run: correlation of the entire Fit set vs the entire New set.
print mygetCorrObs(glbObsFitFtr, 
                   glbObsNewFtr, chunkSize = 1000)

Finally, let's save the data for later reuse.
Remember to rename the previously pickled file with an '_unshuffled' suffix before overwriting it.

In [75]:
# glbPickleFile = os.getcwd() + '/data/notMNIST.pickle'
# print glbPickleFile
In [138]:
# try:
#   f = open('data/' + glbPickleFile, 'wb')
#   save = {
#     'glbObsTrnIdn': glbObsTrnIdn,
#     'glbObsTrnFtr': glbObsTrnFtr,
#     'glbObsTrnRsp': glbObsTrnRsp,
        
#     'glbObsFitIdn': glbObsFitIdn,        
#     'glbObsFitFtr': glbObsFitFtr,
#     'glbObsFitRsp': glbObsFitRsp,
        
#     'glbObsVldIdn': glbObsVldIdn,        
#     'glbObsVldFtr': glbObsVldFtr,
#     'glbObsVldRsp': glbObsVldRsp,
        
#     'glbObsNewIdn': glbObsNewIdn,        
#     'glbObsNewFtr': glbObsNewFtr,
#     'glbObsNewRsp': glbObsNewRsp,
        
#     'sbtNewCorDf' : sbtNewCorDf            
#     }
#   pickle.dump(save, f, pickle.HIGHEST_PROTOCOL)
#   f.close()
# except Exception as e:
#   print('Unable to save data to', glbPickleFile, ':', e)
#   raise
    
# statinfo = os.stat('data/' + glbPickleFile)
# print('Compressed pickle size:', statinfo.st_size)       
('Compressed pickle size:', 512899134)
In [ ]:
 
In [94]:
# print glbObsTrnFtr[0:3]
# print np.ascontiguousarray(glbObsTrnFtr[0:3])
# print np.ascontiguousarray(glbObsTrnFtr[0:3]).shape
In [139]:
# obsFitSet = set(img.tostring() for img in glbObsFitFtr)
# print 'Fit: shape: %s vs. len(set): %d pctDups: %0.4f' % \
#     (glbObsFitFtr.shape, len(obsFitSet), \
#      (glbObsFitFtr.shape[0] * 1.0 / len(obsFitSet) - 1) * 100)

# obsVldSet = set(img.tostring() for img in glbObsVldFtr)
# print 'Vld: shape: %s vs. len(set): %d pctDups: %0.4f' % \
#     (glbObsVldFtr.shape, len(obsVldSet), \
#      (glbObsVldFtr.shape[0] * 1.0 / len(obsVldSet) - 1) * 100)

# obsNewSet = set(img.tostring() for img in glbObsNewFtr)
# print 'New: shape: %s vs. len(set): %d pctDups: %0.4f' % \
#     (glbObsNewFtr.shape, len(obsNewSet), \
#      (glbObsNewFtr.shape[0] * 1.0 / len(obsNewSet) - 1) * 100) 
Fit: shape: (17940, 32, 32) vs. len(set): 17940 pctDups: 0.0000
Vld: shape: (4484, 32, 32) vs. len(set): 4484 pctDups: 0.0000
New: shape: (79726, 32, 32) vs. len(set): 79724 pctDups: 0.0025
In [79]:
#print glbObsTrnFtr[0:3]
# obsFitSet = set(img.tostring() for img in glbObsTrnFtr)
# print 'train: shape: %s vs. len(set): %d pctDups: %0.4f' % \
#     (glbObsTrnFtr.shape, len(obsFitSet), \
#      (glbObsTrnFtr.shape[0] * 1.0 / len(obsFitSet) - 1) * 100)

# validSet = set(img.tostring() for img in glbObsVldFtr)
# print 'valid: shape: %s vs. len(set): %d pctDups: %0.4f' % \
#     (glbObsVldFtr.shape, len(validSet), \
#      (glbObsVldFtr.shape[0] * 1.0 / len(validSet) - 1) * 100)

# obsNewSet = set(img.tostring() for img in glbObsNewFtr)
# print 'test : shape: %s vs. len(set): %d pctDups: %0.4f' % \
#     (glbObsNewFtr.shape, len(obsNewSet), \
#      (glbObsNewFtr.shape[0] * 1.0 / len(obsNewSet) - 1) * 100)    
In [142]:
# print 'Vld set overlap with Fit set: %0.4f' % \
#     (len(obsVldSet.intersection(obsFitSet)) * 1.0 / len(obsVldSet))
# print 'Vld set overlap with New set: %0.4f' % \
#     (len(obsVldSet.intersection(obsNewSet)) * 1.0 / len(obsNewSet))
# print 'Fit set overlap with New set: %0.4f' % \
#     (len(obsFitSet.intersection(obsNewSet)) * 1.0 / len(obsFitSet))
# print ' test set overlap with train set: %0.4f' % \
#     (len( obsNewSet.intersection(obsFitSet)) * 1.0 / len( obsNewSet))    
# print 'valid set overlap with  test set: %0.4f' % \
#     (len(validSet.intersection( obsNewSet)) * 1.0 / len(validSet))
Vld set overlap with Fit set: 0.0000
Vld set overlap with New set: 0.0000
Fit set overlap with New set: 0.0000

Stop here!

The code that follows now lives in img_02_fit_lgtRgr_SFDD.

Let's get an idea of what an off-the-shelf classifier can give you on this data. It's always good to check that there is something to learn, and that it's a problem that is not so trivial that a canned solution solves it.

Train a simple model on this data using 50, 100, 1000 and 5000 training samples. Hint: you can use the LogisticRegression model from sklearn.linear_model.

Optional question: train an off-the-shelf model on all the data!


In [110]:
# import graphlab
# print graphlab.version
# graphlab.canvas.set_target('ipynb')
1.8.1
In [ ]:
# graphlab.logistic_classifier.create(image_train,target='label',
#                                               features=['image_array'])
In [113]:
print glbObsTrnFtr[0:3,:,:]
print np.reshape(glbObsTrnFtr[0:3,:,:], (3, glbObsTrnFtr.shape[1] * glbObsTrnFtr.shape[2]))
print np.reshape(glbObsTrnFtr[0:3,:,:], (3, glbObsTrnFtr.shape[1] * glbObsTrnFtr.shape[2])).shape
[[[-0.5        -0.5        -0.5        ..., -0.49215686 -0.49607843 -0.5       ]
  [-0.5        -0.5        -0.5        ..., -0.44901961 -0.5        -0.49607843]
  [-0.5        -0.5        -0.5        ...,  0.29215688 -0.41764706 -0.5       ]
  ..., 
  [-0.5        -0.5        -0.5        ..., -0.49607843 -0.49607843
   -0.49607843]
  [-0.19019608  0.11176471  0.37450981 ..., -0.48823529 -0.49607843 -0.5       ]
  [ 0.24901961  0.34705883  0.19411765 ..., -0.49607843 -0.5        -0.5       ]]

 [[-0.5        -0.5        -0.5        ...,  0.5         0.5         0.5       ]
  [-0.5        -0.5        -0.5        ...,  0.5         0.5         0.5       ]
  [-0.5        -0.5        -0.5        ...,  0.5         0.5         0.5       ]
  ..., 
  [-0.43725491  0.04901961  0.38627452 ...,  0.45294118  0.22941177
   -0.30000001]
  [-0.5        -0.5        -0.3392157  ..., -0.20196079 -0.45686275 -0.5       ]
  [-0.49607843 -0.49215686 -0.49607843 ..., -0.5        -0.5        -0.49215686]]

 [[-0.5        -0.49607843 -0.43725491 ..., -0.5        -0.49607843 -0.5       ]
  [-0.40980393  0.11960784  0.42941177 ..., -0.24901961 -0.5        -0.49607843]
  [-0.03333334  0.5         0.48431373 ...,  0.41764706 -0.4254902  -0.5       ]
  ..., 
  [-0.5        -0.48039216 -0.06078431 ..., -0.5        -0.5        -0.5       ]
  [-0.5        -0.36666667  0.5        ..., -0.5        -0.5        -0.5       ]
  [-0.5        -0.39803922  0.28823531 ..., -0.5        -0.5        -0.5       ]]]
[[-0.5        -0.5        -0.5        ..., -0.49607843 -0.5        -0.5       ]
 [-0.5        -0.5        -0.5        ..., -0.5        -0.5        -0.49215686]
 [-0.5        -0.49607843 -0.43725491 ..., -0.5        -0.5        -0.5       ]]
(3, 784)
In [134]:
from sklearn import metrics, linear_model
import pandas as pd
In [171]:
def fitMdl(nFitObs = 50):
    """Fit a logistic-regression baseline on the first nFitObs
    training images and report coefficient stats plus accuracy and
    confusion matrices on the train, validation and test splits.

    nFitObs -- number of leading glbObsTrnFtr rows to fit on.
    Returns (mdl, (accuracy_train, accuracy_valid, accuracy_test)).

    Relies on module-level globals: glbObsTrnFtr/Rsp, glbObsVldFtr/Rsp,
    glbObsNewFtr/Rsp, dspLabels, glbImg['size'].
    """
    mdl = linear_model.LogisticRegression(verbose = 1)
    # Images are flattened from (nFitObs, rows, cols) to 2-D for sklearn.
    mdl.fit(np.reshape(glbObsTrnFtr[0:nFitObs,:,:], \
                            (nFitObs, glbObsTrnFtr.shape[1] * glbObsTrnFtr.shape[2])), \
                 glbObsTrnRsp[0:nFitObs])
    print mdl.get_params()
    print mdl.coef_.shape
    print '  coeff stats:'
    # Per-class extreme coefficients, reported as (row, col) pixel
    # positions.  `/` is integer floor division here (Python 2 ints),
    # converting the flat argmin/argmax index back to a row.
    for lblIx in xrange(len(dspLabels)):
        print '  label:%s; minCoeff:row:%2d, col:%2d, value:%0.4f; maxCoeff:row:%2d, col:%2d, value:%0.4f;' % \
            (dspLabels[lblIx], \
             mdl.coef_[lblIx,:].argmin() / glbImg['size'], \
             mdl.coef_[lblIx,:].argmin() % glbImg['size'], \
             mdl.coef_[lblIx,:].min(), \
             mdl.coef_[lblIx,:].argmax() / glbImg['size'], \
             mdl.coef_[lblIx,:].argmax() % glbImg['size'], \
             mdl.coef_[lblIx,:].max())

    # In-sample accuracy on the rows the model was fit on.
    train_pred_labels = mdl.predict(np.reshape(glbObsTrnFtr[0:nFitObs,:,:], \
                                                    (nFitObs               , glbImg['size'] ** 2)))
    accuracy_train = metrics.accuracy_score(train_pred_labels, glbObsTrnRsp[0:nFitObs])
    print '  accuracy train:%0.4f' % (accuracy_train)
    print metrics.confusion_matrix(glbObsTrnRsp[0:nFitObs], train_pred_labels)

    # Held-out validation accuracy.
    valid_pred_labels = mdl.predict(np.reshape(glbObsVldFtr, \
                                                    (glbObsVldFtr.shape[0], glbImg['size'] ** 2)))
    accuracy_valid = metrics.accuracy_score(valid_pred_labels, glbObsVldRsp)
    print '  accuracy valid:%0.4f' % (accuracy_valid)
    print metrics.confusion_matrix(glbObsVldRsp           , valid_pred_labels)

    # Test ("New") accuracy, with a labeled confusion-matrix DataFrame.
    test_pred_labels  = mdl.predict(np.reshape(glbObsNewFtr, \
                                                    (glbObsNewFtr.shape[0], glbImg['size'] ** 2)))
    accuracy_test = metrics.accuracy_score( test_pred_labels,  glbObsNewRsp)
    print '  accuracy  test:%0.4f' % (accuracy_test)
    test_conf = pd.DataFrame(metrics.confusion_matrix( glbObsNewRsp,  test_pred_labels), \
                             index = dspLabels, columns = dspLabels)
    print test_conf
    
    return(mdl, (accuracy_train, accuracy_valid, accuracy_test))
In [172]:
# Baseline: logistic regression fit on just 50 training samples.
mdl50 = fitMdl(nFitObs = 50) 
[LibLinear]{'warm_start': False, 'C': 1.0, 'n_jobs': 1, 'verbose': 1, 'intercept_scaling': 1, 'fit_intercept': True, 'max_iter': 100, 'penalty': 'l2', 'multi_class': 'ovr', 'random_state': None, 'dual': False, 'tol': 0.0001, 'solver': 'liblinear', 'class_weight': None}
(10, 784)
  coeff stats:
  label:A; minCoeff:row:26, col: 8, value:-0.2571; maxCoeff:row:24, col:25, value:0.1487;
  label:B; minCoeff:row: 2, col:20, value:-0.2250; maxCoeff:row:16, col:23, value:0.2356;
  label:C; minCoeff:row:26, col: 4, value:-0.2084; maxCoeff:row:25, col:26, value:0.2056;
  label:D; minCoeff:row:25, col: 7, value:-0.1682; maxCoeff:row: 9, col:25, value:0.1925;
  label:E; minCoeff:row: 1, col:19, value:-0.1914; maxCoeff:row:25, col:27, value:0.2057;
  label:F; minCoeff:row: 1, col:19, value:-0.1759; maxCoeff:row: 2, col: 1, value:0.2158;
  label:G; minCoeff:row: 1, col:19, value:-0.2289; maxCoeff:row:11, col: 0, value:0.1832;
  label:H; minCoeff:row:26, col: 9, value:-0.2210; maxCoeff:row:27, col:27, value:0.1907;
  label:I; minCoeff:row: 0, col:14, value:-0.1343; maxCoeff:row:27, col:27, value:0.2123;
  label:J; minCoeff:row:13, col: 9, value:-0.1960; maxCoeff:row: 0, col:21, value:0.1679;
  accuracy train:1.0000
[[5 0 0 0 0 0 0 0 0 0]
 [0 6 0 0 0 0 0 0 0 0]
 [0 0 4 0 0 0 0 0 0 0]
 [0 0 0 4 0 0 0 0 0 0]
 [0 0 0 0 6 0 0 0 0 0]
 [0 0 0 0 0 4 0 0 0 0]
 [0 0 0 0 0 0 6 0 0 0]
 [0 0 0 0 0 0 0 2 0 0]
 [0 0 0 0 0 0 0 0 4 0]
 [0 0 0 0 0 0 0 0 0 9]]
  accuracy valid:0.5822
[[682  27   6  28  28  18  33  59  31 121]
 [ 24 671  18  48  33  21  73  14  63  49]
 [ 37  39 574  35 161   2 102   1  28  29]
 [ 24  53  16 698  14  21  60   5  31  63]
 [ 51 215 118  13 377   9  36   8  84  45]
 [ 55 173  17  18 168 437  21   4  16  53]
 [ 43  46 216  37  60  20 513   3  20  80]
 [ 79 101   8  30  90 160  37 385  50  35]
 [ 47  20  38  10  60   7  32   7 625 188]
 [ 26  11   9  16  17  16  26   1  13 860]]
  accuracy  test:0.6381
      A     B     C     D    E    F     G    H     I     J
A  1283    43     9    34   33   40    42  103    34   251
B    38  1448    19    65   41   23    73   16    77    73
C    27    65  1241    53  271    5   137    1    44    29
D    24    92    32  1474   25   30    68    5    43    80
E    46   495   210    14  818   15    43   13   175    44
F    93   371    16    22  302  919    25    9    29    86
G    67    61   417    60   76   16  1015    6    37   117
H   144   182    18    39  206  241    44  793   154    51
I    33    21    36    20  122   16    26   10  1223   365
J    19     9    13    22   21   15    16    1    22  1734
In [181]:
models = pd.DataFrame({'nFitObs': [1e2, 1e3, 1e4, 1e5, glbObsTrnFtr.shape[0]]})
models = models.set_index(models['nFitObs'])
models['mdl'] = linear_model.LogisticRegression()
models['accuracy.fit'] = -1; models['accuracy.vld'] = -1; models['accuracy.new'] = -1

for thsN in models['nFitObs']: 
    models.ix[thsN, 'mdl'], (models.ix[thsN, 'accuracy.fit'], \
                             models.ix[thsN, 'accuracy.vld'], \
                             models.ix[thsN, 'accuracy.new'], \
                            ) = fitMdl(nFitObs = thsN)
    
print models
[LibLinear]{'warm_start': False, 'C': 1.0, 'n_jobs': 1, 'verbose': 1, 'intercept_scaling': 1, 'fit_intercept': True, 'max_iter': 100, 'penalty': 'l2', 'multi_class': 'ovr', 'random_state': None, 'dual': False, 'tol': 0.0001, 'solver': 'liblinear', 'class_weight': None}
/usr/local/lib/python2.7/site-packages/ipykernel/__main__.py:3: DeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  app.launch_new_instance()
/usr/local/lib/python2.7/site-packages/ipykernel/__main__.py:10: DeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
(10, 784)
  coeff stats:
  label:A; minCoeff:row:26, col: 8, value:-0.3014; maxCoeff:row:17, col:17, value:0.2229;
  label:B; minCoeff:row: 1, col:19, value:-0.2240; maxCoeff:row:16, col:23, value:0.3035;
  label:C; minCoeff:row:26, col: 8, value:-0.2396; maxCoeff:row:25, col:15, value:0.1714;
  label:D; minCoeff:row:14, col:19, value:-0.2523; maxCoeff:row:27, col: 1, value:0.2116;
  label:E; minCoeff:row: 9, col:19, value:-0.2736; maxCoeff:row:11, col:11, value:0.2807;
  label:F; minCoeff:row:26, col:19, value:-0.3569; maxCoeff:row: 2, col: 2, value:0.2562;
  label:G; minCoeff:row: 1, col:19, value:-0.2610; maxCoeff:row:18, col:27, value:0.2457;
  label:H; minCoeff:row:26, col:10, value:-0.2259; maxCoeff:row: 0, col:27, value:0.1947;
  label:I; minCoeff:row:15, col:18, value:-0.2584; maxCoeff:row:27, col:27, value:0.2571;
  label:J; minCoeff:row:24, col: 5, value:-0.2323; maxCoeff:row: 0, col:27, value:0.2230;
  accuracy train:1.0000
[[11  0  0  0  0  0  0  0  0  0]
 [ 0  9  0  0  0  0  0  0  0  0]
 [ 0  0  9  0  0  0  0  0  0  0]
 [ 0  0  0  6  0  0  0  0  0  0]
 [ 0  0  0  0 16  0  0  0  0  0]
 [ 0  0  0  0  0  9  0  0  0  0]
 [ 0  0  0  0  0  0 12  0  0  0]
 [ 0  0  0  0  0  0  0  6  0  0]
 [ 0  0  0  0  0  0  0  0 10  0]
 [ 0  0  0  0  0  0  0  0  0 12]]
  accuracy valid:0.6829
[[728  26  10  17  45  13  45  39  22  88]
 [ 23 692  14  42  67  24  49  20  51  32]
 [ 16  22 699  13 128   6  70   6  20  28]
 [ 16  54  20 694  32  23  38  22  27  59]
 [ 18  24  78   4 606  52  25  36  76  37]
 [ 13  13  17   2 161 671  16   9  18  42]
 [ 20  28  93  17  53  21 733   5  20  48]
 [ 57  44  10  23 118 115  34 516  30  28]
 [ 30  10  31   7  85  24  27  13 651 156]
 [ 18   7  13  16  27  23  27   4  21 839]]
  accuracy  test:0.7498
      A     B     C     D     E     F     G     H     I     J
A  1382    33     7    22    46    41    61    50    26   204
B    31  1485    18    44    84    28    41    14    65    63
C     4    38  1484    15   152     6   119     6    28    21
D    17    72    28  1502    35    38    47    14    36    84
E    27    92   153     7  1199   115    24    62   166    28
F    23     3    13     7   247  1433    15    36    26    69
G    26    41   152    11    71    21  1462     8    21    59
H    97    48    20    33   238   175    42  1092    81    46
I    29    22    25    15   105    36    18    16  1310   296
J     7    12    18    26    23    36    16     8    36  1690
[LibLinear]{'warm_start': False, 'C': 1.0, 'n_jobs': 1, 'verbose': 1, 'intercept_scaling': 1, 'fit_intercept': True, 'max_iter': 100, 'penalty': 'l2', 'multi_class': 'ovr', 'random_state': None, 'dual': False, 'tol': 0.0001, 'solver': 'liblinear', 'class_weight': None}
(10, 784)
  coeff stats:
  label:A; minCoeff:row: 4, col: 7, value:-0.7170; maxCoeff:row:27, col:27, value:0.6456;
  label:B; minCoeff:row: 0, col:26, value:-1.0239; maxCoeff:row:18, col:27, value:0.6353;
  label:C; minCoeff:row:15, col:16, value:-0.6154; maxCoeff:row: 7, col:26, value:0.6948;
  label:D; minCoeff:row: 1, col:27, value:-0.6937; maxCoeff:row:13, col:26, value:0.6355;
  label:E; minCoeff:row:17, col:26, value:-1.0853; maxCoeff:row:14, col:21, value:0.7244;
  label:F; minCoeff:row:11, col:13, value:-0.4294; maxCoeff:row: 2, col: 1, value:0.4680;
  label:G; minCoeff:row:12, col:18, value:-0.7411; maxCoeff:row:15, col:14, value:0.6454;
  label:H; minCoeff:row: 0, col:15, value:-0.7681; maxCoeff:row: 0, col:27, value:1.0599;
  label:I; minCoeff:row:23, col:18, value:-0.7867; maxCoeff:row:24, col: 2, value:0.7645;
  label:J; minCoeff:row:27, col: 7, value:-0.7077; maxCoeff:row: 0, col:27, value:0.9120;
  accuracy train:0.9950
[[110   0   0   0   0   0   0   0   0   0]
 [  1 106   0   0   0   0   0   0   0   0]
 [  0   0  99   0   0   0   0   0   0   0]
 [  0   0   0  91   0   0   0   0   0   0]
 [  0   0   0   0 102   0   0   0   0   0]
 [  0   0   0   0   0  83   0   0   0   0]
 [  0   0   0   0   0   0 102   0   0   0]
 [  0   0   0   0   0   0   0 101   0   0]
 [  0   0   0   0   0   0   1   1  96   1]
 [  0   0   0   0   0   0   0   0   1 105]]
  accuracy valid:0.7580
[[797  15  15  25  19  15  20  50  32  45]
 [ 28 740  19  60  32  17  31  19  38  30]
 [ 28  17 779  12  48  10  51  10  25  28]
 [ 27  33  11 768  16  17  32  10  31  40]
 [ 24  35  62  11 642  23  35  35  57  32]
 [ 19  19  16  12  53 751  18  10  29  35]
 [ 26  29  54  22  37  19 774  12  22  43]
 [ 36  24  15  25  32  29  24 734  25  31]
 [ 23  14  14  20  34  20  30  21 769  89]
 [ 26   5  13  22  15  24  22  13  29 826]]
  accuracy  test:0.8342
      A     B     C     D     E     F     G     H     I     J
A  1543     8    13    22    20    24    41    94    33    74
B    27  1550    18    57    57    21    42    21    42    38
C    23    23  1627    12    63    10    58     7    22    28
D    21    35    25  1627    25    15    30    11    41    43
E    27   108   104     9  1401    26    28    56    84    30
F    23    19    15    21    53  1614    19    14    38    56
G    34    19   104    22    48    17  1513    23    28    64
H    57    36    19    26    49    23    31  1541    45    45
I    36    13    14    17    43    22    30    19  1520   158
J    24     7    12    23    11    33    25     7    46  1684
[LibLinear]{'warm_start': False, 'C': 1.0, 'n_jobs': 1, 'verbose': 1, 'intercept_scaling': 1, 'fit_intercept': True, 'max_iter': 100, 'penalty': 'l2', 'multi_class': 'ovr', 'random_state': None, 'dual': False, 'tol': 0.0001, 'solver': 'liblinear', 'class_weight': None}
(10, 784)
  coeff stats:
  label:A; minCoeff:row:23, col:13, value:-1.6050; maxCoeff:row:27, col:27, value:1.5933;
  label:B; minCoeff:row:14, col:27, value:-1.6509; maxCoeff:row:10, col:18, value:1.4947;
  label:C; minCoeff:row: 9, col:16, value:-1.4944; maxCoeff:row: 6, col:10, value:1.3965;
  label:D; minCoeff:row: 0, col:27, value:-1.3080; maxCoeff:row:16, col:21, value:1.3399;
  label:E; minCoeff:row:19, col: 6, value:-1.3329; maxCoeff:row: 8, col: 8, value:1.4913;
  label:F; minCoeff:row:12, col:23, value:-1.5250; maxCoeff:row:19, col: 2, value:1.3375;
  label:G; minCoeff:row:13, col: 5, value:-1.4522; maxCoeff:row:17, col:15, value:1.6977;
  label:H; minCoeff:row: 0, col:15, value:-1.7416; maxCoeff:row:16, col:15, value:1.5178;
  label:I; minCoeff:row:26, col:17, value:-1.2247; maxCoeff:row:23, col:10, value:1.4428;
  label:J; minCoeff:row: 3, col: 4, value:-1.2762; maxCoeff:row:17, col:10, value:1.3836;
  accuracy train:0.8983
[[939   9   9  10   9   7  12  27  15  13]
 [  5 889   3  26  18   5  12  13  17   5]
 [  3   7 922   3  13   2  10   6  14   5]
 [  9   9   3 906   8   8  13  10  11   5]
 [ 10  17  30   8 833  12  19  11  35   5]
 [  8   1   3   9   5 902  13   7  14   6]
 [  9   5  27  18   9  11 887  17  23  13]
 [ 19   6   2   7   8   7  18 892  24   7]
 [ 14   8   7  20  10  20  12  20 887  40]
 [  5   5   3   5   2  13   1   9  26 926]]
  accuracy valid:0.7892
[[829   9  16  19  24  13  23  45  33  22]
 [ 20 758  18  42  31  25  35  28  38  19]
 [ 16  20 827  14  29  16  35  18  23  10]
 [ 21  28   8 791  11  23  21  20  27  35]
 [ 19  28  73   9 682  29  29  29  44  14]
 [ 16  13  25  13  23 786  19  19  34  14]
 [ 15  10  60  25  22  25 822  17  21  21]
 [ 40  11  16  20  20  32  23 771  28  14]
 [ 25  12  19  18  22  19  17  36 814  52]
 [ 15  11  14  24  10  29  18  19  43 812]]
  accuracy  test:0.8602
      A     B     C     D     E     F     G     H     I     J
A  1617    20    19    15    26    15    27    57    24    52
B    20  1583    13    56    54    28    35    24    32    28
C     7    15  1692     9    53    13    47     5    20    12
D    26    49    18  1637    16    17    21    13    34    42
E    27    62    95     9  1502    38    28    29    61    22
F    22    21    24    11    30  1660    17    12    46    29
G    28    23    78    27    35    28  1577    23    20    33
H    68    20    19    19    35    29    24  1593    33    32
I    25    11    17    26    39    31    28    35  1572    88
J    22     7    18    17     7    33    21     7    66  1674
[LibLinear]{'warm_start': False, 'C': 1.0, 'n_jobs': 1, 'verbose': 1, 'intercept_scaling': 1, 'fit_intercept': True, 'max_iter': 100, 'penalty': 'l2', 'multi_class': 'ovr', 'random_state': None, 'dual': False, 'tol': 0.0001, 'solver': 'liblinear', 'class_weight': None}
(10, 784)
  coeff stats:
  label:A; minCoeff:row:27, col:10, value:-1.1414; maxCoeff:row: 6, col: 2, value:1.1745;
  label:B; minCoeff:row: 0, col:26, value:-1.7742; maxCoeff:row:20, col:27, value:1.0678;
  label:C; minCoeff:row: 1, col: 3, value:-0.9762; maxCoeff:row: 1, col: 4, value:1.0519;
  label:D; minCoeff:row: 0, col:26, value:-1.4436; maxCoeff:row:14, col: 9, value:0.8546;
  label:E; minCoeff:row:20, col:27, value:-0.9845; maxCoeff:row: 8, col: 0, value:1.8054;
  label:F; minCoeff:row:23, col:26, value:-1.1607; maxCoeff:row: 1, col:27, value:0.7309;
  label:G; minCoeff:row:13, col:12, value:-1.0996; maxCoeff:row:19, col:17, value:0.9612;
  label:H; minCoeff:row: 0, col:14, value:-1.1564; maxCoeff:row: 0, col:27, value:0.7920;
  label:I; minCoeff:row:15, col: 2, value:-0.7929; maxCoeff:row: 9, col: 2, value:0.9165;
  label:J; minCoeff:row:13, col: 0, value:-1.0014; maxCoeff:row: 0, col:26, value:1.4257;
  accuracy train:0.8344
[[8411  135  112  133  105  133  171  380  236  237]
 [ 147 8005  104  406  236  141  222  174  242  122]
 [  77  102 8744  117  203  107  252  116  199   84]
 [ 151  233   99 8443   86  158  150  161  220  142]
 [ 144  195  549  119 7692  299  283  177  444  139]
 [ 100   62  107   93  109 8753  156  125  260  199]
 [ 168  136  401  145  147  195 8363  125  267  209]
 [ 317  121   95  154  159  159  171 8556  304  148]
 [ 184  129  147  164  171  220  194  274 7897  597]
 [ 131   90   87  157   71  192  151  123  405 8575]]
  accuracy valid:0.8206
[[857  16  12  14  12  10  19  38  29  26]
 [ 17 787  12  53  33  26  25  20  26  15]
 [  7  10 856  12  30   9  35  16  23  10]
 [ 14  23   6 829   9  16  26  16  25  21]
 [ 11  25  62   9 723  24  28  25  36  13]
 [ 12   9  15  11  14 824  19  11  32  15]
 [ 11  12  46  18  15  30 848  11  25  22]
 [ 32  10   7  14  20  16  20 814  27  15]
 [ 19   9  21  11  13  19  23  38 831  50]
 [ 13   6   9  23  10  31  11  15  40 837]]
  accuracy  test:0.8891
      A     B     C     D     E     F     G     H     I     J
A  1659    12    14     8    14     8    29    65    23    40
B    11  1643    11    62    33    25    29    18    28    13
C     6     8  1742     4    27    22    27     7    23     7
D    16    28     9  1718    12    18     8    16    24    24
E    13    47    74    13  1568    35    32    14    65    12
F    16     7    16     7    10  1722    19     5    34    36
G    18    16    67    17    16    42  1636    17    27    16
H    50    20    11    12    28    27    21  1651    39    13
I    27    10    12    18    28    30    25    24  1603    95
J    16     2     7    18    10    36    15    10    52  1706
[LibLinear]{'warm_start': False, 'C': 1.0, 'n_jobs': 1, 'verbose': 1, 'intercept_scaling': 1, 'fit_intercept': True, 'max_iter': 100, 'penalty': 'l2', 'multi_class': 'ovr', 'random_state': None, 'dual': False, 'tol': 0.0001, 'solver': 'liblinear', 'class_weight': None}
(10, 784)
  coeff stats:
  label:A; minCoeff:row:27, col:12, value:-1.0494; maxCoeff:row:27, col:27, value:1.1473;
  label:B; minCoeff:row: 0, col:26, value:-1.8980; maxCoeff:row:20, col:27, value:0.8489;
  label:C; minCoeff:row:27, col: 1, value:-0.8797; maxCoeff:row:13, col: 0, value:0.7399;
  label:D; minCoeff:row: 0, col:27, value:-1.1843; maxCoeff:row:27, col: 5, value:0.7192;
  label:E; minCoeff:row: 7, col: 0, value:-0.8467; maxCoeff:row: 8, col: 0, value:1.0877;
  label:F; minCoeff:row:27, col:27, value:-0.8682; maxCoeff:row: 8, col:27, value:0.8271;
  label:G; minCoeff:row:27, col: 0, value:-0.8403; maxCoeff:row:15, col:27, value:0.7896;
  label:H; minCoeff:row: 0, col:13, value:-1.1541; maxCoeff:row: 0, col:27, value:0.8937;
  label:I; minCoeff:row:26, col: 3, value:-0.6741; maxCoeff:row:20, col: 0, value:0.8260;
  label:J; minCoeff:row:27, col: 1, value:-0.9333; maxCoeff:row: 0, col:26, value:1.4196;
  accuracy train:0.8295
[[43020   771   559   789   559   637   933  2049  1217  1342]
 [  746 41687   527  2334  1263   909  1196   958  1432   845]
 [  375   602 45251   469  1162   515  1458   509  1130   433]
 [  729  1281   474 44419   427   904   772   792  1153   975]
 [  630  1114  2918   652 39753  1624  1319   984  2316   646]
 [  565   385   601   486   520 45425   869   661  1422  1016]
 [  895   836  2280   768   740  1017 42364   672  1282  1020]
 [ 1577   749   436   820   769   875   838 43527  1546   800]
 [ 1046   727   713   961   934  1094  1018  1338 40968  3079]
 [  826   477   491   896   390  1032   746   606  2257 44195]]
  accuracy valid:0.8252
[[872  14  12  14  10   7  17  35  27  25]
 [ 20 797  12  54  32  25  24  14  26  10]
 [  8  14 861  14  26  12  31  11  20  11]
 [ 13  28   6 832  10  12  14  17  33  20]
 [ 15  20  60  13 729  24  24  22  37  12]
 [ 13   7  11  10  15 832  17  14  28  15]
 [ 12  15  41  18  10  32 857  13  20  20]
 [ 38   7   5  10  22  21  21 808  27  16]
 [ 21  13  19   9  19  21  23  29 830  50]
 [ 11   9  13  22   9  28  12  15  42 834]]
  accuracy  test:0.8938
      A     B     C     D     E     F     G     H     I     J
A  1661    17    10    14    16    10    19    55    26    44
B    11  1651    12    62    25    27    28    14    27    16
C     4     4  1750     6    25    20    28     7    22     7
D    15    24     9  1726    10    23     9    10    25    22
E    11    54    74    12  1575    34    23    13    60    17
F    13     7    12     4     6  1750    11     6    24    39
G    17    18    60    13    11    40  1653    15    27    18
H    49    18    11    16    29    23    20  1648    38    20
I    24     8    12    19    28    30    25    22  1605    99
J    14     5    11    16     6    36     9     5    53  1717
         nFitObs                                                mdl  \
nFitObs                                                               
100          100  LogisticRegression(C=1.0, class_weight=None, d...   
1000        1000  LogisticRegression(C=1.0, class_weight=None, d...   
10000      10000  LogisticRegression(C=1.0, class_weight=None, d...   
100000    100000  LogisticRegression(C=1.0, class_weight=None, d...   
519114    519114  LogisticRegression(C=1.0, class_weight=None, d...   

         accuracy.fit  accuracy.vld  accuracy.new  
nFitObs                                            
100          1.000000        0.6829      0.749786  
1000         0.995000        0.7580      0.834223  
10000        0.898300        0.7892      0.860233  
100000       0.834390        0.8206      0.889126  
519114       0.829508        0.8252      0.893826  
/usr/local/lib/python2.7/site-packages/ipykernel/__main__.py:11: DeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
/usr/local/lib/python2.7/site-packages/ipykernel/__main__.py:13: DeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
In [192]:
# Plot learning curves: model accuracy on the fit / validation / new (test)
# sets as a function of training-set size (`models` is the per-nFitObs
# summary DataFrame assembled in the cells above).
# Uses the explicit fig/ax interface instead of the implicit pyplot state
# machine, and a log x-scale since nFitObs spans 100 .. ~520k.
fig, ax = plt.subplots()
ax.plot(models['nFitObs'], models['accuracy.fit'], 'bo-', label = 'fit')
ax.plot(models['nFitObs'], models['accuracy.vld'], 'rs-', label = 'vld')
ax.plot(models['nFitObs'], models['accuracy.new'], 'gp-', label = 'new')
ax.legend()
ax.set_title("Accuracy")
ax.set_xscale('log')
ax.set_xlabel('nFitObs')
ax.set_ylabel('accuracy')  # label added so the figure stands alone
plt.show()
In [ ]:
 
In [ ]:
 
In [123]:
# Show the ordered class labels (notMNIST letters) used for the confusion
# matrices above; `dspLabels` is defined earlier in the notebook.
print dspLabels
['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
In [154]:
import pandas as pd
[INFO] This non-commercial license of GraphLab Create is assigned to bbalaji8@gmail.com and will expire on December 09, 2016. For commercial licensing options, visit https://dato.com/buy/.

[INFO] Start server at: ipc:///tmp/graphlab_server-18168 - Server binary: /usr/local/lib/python2.7/site-packages/graphlab/unity_server - Server log: /tmp/graphlab_server_1454417383.log
[INFO] GraphLab Server Version: 1.8.1
In [ ]: